mattb512 committed
Commit b3d2785
1 parent: c2711e7

add guidance, intermediary latents

Files changed (2)
  1. app.py +90 -41
  2. image_generator.py +27 -13
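
In short, the commit drops the guidance scale `g` that was fixed in the `ImageGenerator` constructor and instead threads a per-call `guidance` argument through `generate()`, and it exposes the intermediate denoising latents via `latent_callback_mod` and `image_grid`. A minimal usage sketch against the new signature (the argument values here are illustrative, not from the commit):

```python
from image_generator import ImageGenerator

ig = ImageGenerator()          # the guidance scale is no longer fixed at construction
ig.load_models()
ig.load_scheduler()

image, latents = ig.generate(
    prompt="a cute dog",
    guidance=7.5,              # new: classifier-free guidance scale, per call
    steps=10,
    latent_callback_mod=1,     # capture a latent snapshot at every step
)
grid = ig.image_grid(latents)  # render the intermediate latents as one strip
```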
app.py CHANGED
```diff
@@ -2,17 +2,27 @@ import gradio as gr
 from image_generator import ImageGenerator
 import os
 
-ig = ImageGenerator(g=7.5)
+
+header = """Hi! This HuggingFace Space is a demo of the homework from the [10th lesson](https://course.fast.ai/Lessons/lesson10.html) of the fast.ai course. You can pick one of the examples below and click the "Generate Image" button.
+
+The code demonstrates:
+* how to use an existing image, in addition to the prompt, as the starting point for image generation
+* how to use a negative prompt
+* how to capture latents throughout the generation
+* how to mix prompt embeddings"""
+
+
+ig = ImageGenerator()
 print(ig)
 ig.load_models()
 ig.load_scheduler()
 
-def call(prompt, mix_prompt, mix_ratio, negative_prompt, steps, init_image):
+def call(prompt, secondary_prompt, mix_ratio, negative_prompt, steps, init_image):
 
-    print(f"{prompt=} {mix_prompt=} {mix_ratio=} {negative_prompt=} {steps=} {init_image=} ")
+    print(f"{prompt=} {secondary_prompt=} {mix_ratio=} {negative_prompt=} {steps=} {init_image=} ")
     generated_image, latents = ig.generate(
         prompt=prompt,
-        secondary_prompt=mix_prompt,
+        secondary_prompt=secondary_prompt,
         prompt_mix_ratio=mix_ratio,
         negative_prompt=negative_prompt,
         steps=steps,
@@ -26,40 +36,79 @@ def call(prompt, mix_prompt, mix_ratio, negative_prompt, steps, init_image):
 
     return generated_image, noisy_latent
 
-iface = gr.Interface(
-    fn=call,
-    inputs=[
-        gr.Textbox(value="a cute dog", label="Prompt", info="primary prompt used to generate an image"),
-        gr.Textbox(value=None, label="Secondary Prompt", info="secondary prompt to mix with the primary embeddings"),
-        gr.Slider(0, 1, value=0.5, label="Mix Ratio", info="mix ratio between the primary and secondary prompts. 0 = primary only. 1 = secondary only"),
-        gr.Textbox(value=None, label="Negative Prompt", info="remove certain aspects from the picture"),
-        gr.Slider(10, 50, value=30, step=1, label="Generation Steps", info="how many steps are used to generate the picture"),
-        gr.Image(type="pil", value=None, label="Starting Image"),  # info="start from this image as opposed to random noise"
-    ],
-    outputs=[
-        gr.Image(type="pil", label="Generated Image"),
-        gr.Image(type="pil", label="Starting Image with Added Noise")],
-    examples=[
-        # simple prompt
-        ["a cute dog", "", 0.3, "", 5, None],
-
-        # # negative prompt
-        # ["a beautiful tree", None, None, "green", 5, None],
-
-        # ["a dancer, high resolution, 4k", None, None, None, 5, None],
-
-        # # with base image
-        # ["a painting of Paris at night in the style of Monet", None, None, None, 5, os.path.join(os.path.dirname(__file__), "examples/ex4.jpg")],
-
-        # ["p1", None, 0.3, None, 5, None],
-        # ["p1", None, 0.3, None, 5, None],
-        # ["p1", None, 0.3, None, 5, None],
-        # ["p1", None, 0.3, None, 5, None],
-        # ["p1", None, 0.3, None, 5, None],
-        # ["p1", None, 0.3, None, 5, None],
-        # ["p1", None, 0.3, None, 5, None],
-    ]
-)
-
-# [(os.path.join(os.path.dirname(__file__), f"examples/ex{x}.jpg")) for x in range(1,11)]
-iface.launch()
+def update_noisy_image_visibility(init_image):
+    if init_image is None:
+        print("update_noisy_image_visibility: hide noisy image")
+        return gr.Image(type="pil", label="Starting Image with Added Noise", visible=False)
+    else:
+        print("update_noisy_image_visibility: show noisy image")
+        return gr.Image(type="pil", label="Starting Image with Added Noise", visible=True)
+
+def run_inference(prompt="", secondary_prompt="", mix_ratio=0.5, negative_prompt="", guidance=7.5, steps=10, init_image=None, progress=gr.Progress()):
+    print(f"{prompt=} {secondary_prompt=} {mix_ratio=} {negative_prompt=} {steps=} {init_image=} ")
+
+    generated_image, latents = ig.generate(
+        prompt=prompt,
+        secondary_prompt=secondary_prompt,
+        prompt_mix_ratio=mix_ratio,
+        negative_prompt=negative_prompt,
+        guidance=guidance,
+        steps=steps,
+        init_image=init_image,
+        latent_callback_mod=1,
+        progress_tqdm=progress.tqdm)
+
+    if init_image is not None:
+        noisy_latent = latents[1]
+    else:
+        noisy_latent = None
+
+    return generated_image, noisy_latent, ig.image_grid(latents)
+
+with gr.Blocks() as demo:
+    with gr.Row():
+        gr.Markdown(value=header)
+    with gr.Row():
+        with gr.Column(scale=1):
+            prompt = gr.Textbox(value="a cute dog", label="Prompt", info="primary prompt used to generate an image")
+            secondary_prompt = gr.Textbox(value=None, label="Secondary Prompt", info="secondary prompt to mix with the primary embeddings")
+            mix_ratio = gr.Slider(0, 1, value=0.5, label="Mix Ratio", info="mix ratio between the primary and secondary prompts. 0 = primary only. 1 = secondary only")
+            negative_prompt = gr.Textbox(value=None, label="Negative Prompt", info="remove certain aspects from the picture")
+            guidance = gr.Slider(0, 14, value=7.5, label="Guidance", info="how closely the model should follow the prompt (the higher, the closer)")
+            steps = gr.Slider(10, 50, value=10, step=1, label="Generation Steps", info="how many steps are used to generate the picture")
+            init_image = gr.Image(type="pil", value=None, label="Starting Image")  # info="start from this image as opposed to random noise"
+            generate_image_btn = gr.Button("Generate Image")
+
+        with gr.Column(scale=1):
+            output_image = gr.Image(type="pil", label="Generated Image")
+            noisy_image = gr.Image(type="pil", label="Starting Image with Added Noise", visible=False)
+            noisy_image.change(fn=update_noisy_image_visibility, inputs=init_image, outputs=noisy_image)
+            latent_images = gr.Image(type="pil", label="Latents through the denoising process", visible=True)
+
+    with gr.Row():
+        # broken example images should be fixed soon: https://github.com/gradio-app/gradio/issues/5067
+        gr.Examples(
+            examples=[
+                # simple prompt
+                ["a cute dog", "", "", "", 7.5, 10, None],
+
+                # negative prompt
+                ["a beautiful tree", "", "", "green", 7.5, 10, None],
+
+                # with base image
+                ["a painting of Paris at night in the style of Pierre Auguste Renoir", "", "", "", 7.5, 50, os.path.join(os.path.dirname(__file__), "examples/ex4.jpg")],
+
+                # with prompt mixing
+                ["a sloth", "a jaguar", 0.5, "", 7.5, 30, None],
+            ],
+            inputs=[prompt, secondary_prompt, mix_ratio, negative_prompt, guidance, steps, init_image],
+            outputs=[output_image, noisy_image, latent_images],
+            fn=run_inference,
+            cache_examples=False)
+
+    generate_image_btn.click(
+        fn=run_inference,
+        inputs=[prompt, secondary_prompt, mix_ratio, negative_prompt, guidance, steps, init_image],
+        outputs=[output_image, noisy_image, latent_images])
+
+demo.launch()
```
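
The new `noisy_image` output starts hidden and is only meaningful when a starting image is supplied. Note that the diff registers the visibility callback on `noisy_image.change`; listening on `init_image.change`, as sketched below, is presumably the intent, since visibility should track the input rather than the output. A minimal sketch of the show/hide pattern, assuming Gradio 4.x semantics (a callback updates a component by returning a replacement instance) and a hypothetical `toggle_noisy_image` helper:

```python
import gradio as gr

def toggle_noisy_image(init_image):
    # show the noised starting image only when a starting image exists
    return gr.Image(type="pil", label="Starting Image with Added Noise",
                    visible=init_image is not None)

with gr.Blocks() as demo:
    init_image = gr.Image(type="pil", label="Starting Image")
    noisy_image = gr.Image(type="pil", label="Starting Image with Added Noise",
                           visible=False)
    # react to changes of the *input* image, updating the hidden output slot
    init_image.change(fn=toggle_noisy_image, inputs=init_image, outputs=noisy_image)

demo.launch()
```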
image_generator.py CHANGED
```diff
@@ -19,11 +19,8 @@ from tqdm.auto import tqdm
 
 logging.disable(logging.WARNING)
 class ImageGenerator():
-    def __init__(self,
-                 g:int=7.5,
-                 ):
+    def __init__(self):
         self.latent_images = []
-        self.g = g
         self.width = 512
         self.height = 512
         self.generator = torch.manual_seed(32)
@@ -31,12 +28,23 @@
         if torch.cuda.is_available():
             self.device = torch.device("cuda")
             self.float_size = torch.float16
+        elif torch.backends.mps.is_available():
+            self.device = torch.device("mps")
+            self.float_size = torch.float32
         else:
+            if not torch.backends.mps.is_built():
+                print("MPS not available because the current PyTorch install was not "
+                      "built with MPS enabled.")
+            else:
+                print("MPS not available because the current MacOS version is not 12.3+ "
+                      "and/or you do not have an MPS-enabled device on this machine.")
             self.device = torch.device("cpu")
             self.float_size = torch.float32
+
+        print(f"pytorch device: {self.device}")
 
     def __repr__(self):
-        return f"Image Generator with {self.g=}"
+        return f"Image Generator with {self.width=} {self.height=}"
 
     def load_models(self):
         self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=self.float_size)
@@ -48,7 +56,6 @@
     def load_scheduler(self,
                        beta_start : float=0.00085,
                        beta_end : float=0.012,
-                       beta_schedule : str="scaled_linear",
                        num_train_timesteps : int=1000):
 
         self.scheduler = LMSDiscreteScheduler(
@@ -63,10 +70,11 @@
 
     def pil_to_latent(self, image: Image) -> torch.Tensor:
         with torch.no_grad():
+            image = image.resize(size=(self.width, self.height))
             np_img = np.transpose(((np.array(image) / 255) - 0.5) * 2, (2, 0, 1))  # turn the PIL image into an np array with values between -1 and 1
             # print(f"{np_img.shape=}")  # 4, 64, 64
 
-            np_images = np.repeat(np_img[np.newaxis, :, :], self.bs, axis=0)  # add a new dimension and repeat the image for each prompt
+            np_images = np.repeat(np_img[np.newaxis, :, :], self.bs, axis=0).astype(np.float32)  # add a new dimension and repeat the image for each prompt; float32 required for Mac
             # print(f"{np_images.shape=}")
 
             decoded_latent = torch.from_numpy(np_images).to(self.device).float()  # <-- the stability-ai VAE uses half(), the compvis VAE uses float?
@@ -96,6 +104,7 @@
         return Image.fromarray((image*255).round().astype("uint8"))
 
     def image_grid(self, imgs: [Image]) -> Image:
+        print(len(imgs))
         w, h = imgs[0].size
         cols = len(imgs)
         grid = Image.new('RGB', size=(cols*w, h))
@@ -125,21 +134,25 @@
         self.latent_images.append(self.tensor_to_pil(decoded))
 
     def generate(self,
-                 prompt : str,
+                 prompt : str="",
                  secondary_prompt : str=None,
                  prompt_mix_ratio : float=0.5,
                  negative_prompt="",
                  seed : int=32,
+                 guidance : float=7.5,
                  steps : int=30,
                  start_step_ratio : float=1/5,
                  init_image : Image=None,
-                 latent_callback_mod : int=10):
+                 latent_callback_mod : int=10,
+                 progress_tqdm : callable=tqdm):
         self.latent_images = []
         if not negative_prompt: negative_prompt = ""
-
+        print(f"ImageGenerator: {prompt=} {secondary_prompt=} {prompt_mix_ratio=} {negative_prompt=} {guidance=} {steps=} {init_image=} ")
+
         with torch.no_grad():
             text = self.text_enc(prompt)
             if secondary_prompt:
+                print("using secondary prompt")
                 sec_prompt_text = self.text_enc(secondary_prompt)
                 text = text * prompt_mix_ratio + sec_prompt_text * (1 - prompt_mix_ratio)
             uncond = self.text_enc(negative_prompt * self.bs, text.shape[1])
@@ -157,21 +170,22 @@
             latents = latents * self.scheduler.init_noise_sigma
             # print(f"{latents.shape=}")
         else:
+            print("using base image")
             start_steps = int(steps * start_step_ratio)  # 0%: too much noise, 100%: no noise
             # print(f"{start_steps=}")
-            latents = self. pil_to_latent(init_image)
+            latents = self.pil_to_latent(init_image)
             self.latent_callback(latents)
             latents = self.add_noise(latents, start_steps).to(self.device).float()
             self.latent_callback(latents)
 
         latents = latents.to(self.device).float()
 
-        for i, ts in enumerate(tqdm(self.scheduler.timesteps, leave=False)):
+        for i, ts in enumerate(progress_tqdm(self.scheduler.timesteps, desc="Latent Generation")):  # leave=False does not work with gradio
             if i >= start_steps:
                 inp = self.scheduler.scale_model_input(torch.cat([latents] * 2), ts)
                 with torch.no_grad():
                     u, t = self.unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2)  # todo: grab these with callbacks
-                pred = u + self.g*(t-u)
+                pred = u + guidance*(t-u)
                 # pred = u + self.g*(t-u)/torch.norm(t-u)*torch.norm(u)
                 latents = self.scheduler.step(pred, ts, latents).prev_sample
```
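
For reference, `guidance` scales the classifier-free guidance step that the diff changes from `pred = u + self.g*(t-u)` to `pred = u + guidance*(t-u)`: `u` is the unconditional (negative-prompt) noise prediction and `t` the text-conditioned one, so larger values push each denoising step harder toward the prompt. The prompt mix in `generate()` is a plain linear interpolation of the two CLIP embeddings; note that, as written, `prompt_mix_ratio=1` keeps the primary prompt only, which is the opposite of what the Mix Ratio slider's info text says. A standalone sketch of both combinations on dummy tensors (the SD-1.x shapes below are assumed, not taken from the commit):

```python
import torch

# classifier-free guidance: move the prediction away from the unconditional
# output and toward the text-conditioned one, scaled by `guidance`
guidance = 7.5
u = torch.randn(1, 4, 64, 64)   # unconditional noise prediction (dummy)
t = torch.randn(1, 4, 64, 64)   # text-conditioned noise prediction (dummy)
pred = u + guidance * (t - u)

# prompt mixing as in generate(): a lerp over the two prompt embeddings;
# with this ordering, prompt_mix_ratio=1.0 keeps the primary prompt only
prompt_mix_ratio = 0.5
text = torch.randn(1, 77, 768)             # primary CLIP embedding (dummy)
sec_prompt_text = torch.randn(1, 77, 768)  # secondary CLIP embedding (dummy)
mixed = text * prompt_mix_ratio + sec_prompt_text * (1 - prompt_mix_ratio)
```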