Added Style embeddings

- app.py +58 -29
- main_inference.py +5 -28
- requirements.txt +12 -8
- style_embeddings/3d_female_cyborgs.bin +0 -0
- style_embeddings/80s_anime.bin +0 -0
- style_embeddings/anders_zorn.bin +0 -0
- style_embeddings/angus_mcbride.bin +0 -0
- style_embeddings/breakcore.bin +0 -0
- style_embeddings/brittney_williams.bin +0 -0
- style_embeddings/buhu_art_style.bin +0 -0
- style_embeddings/bull_vs_bear.bin +0 -0
- style_embeddings/caitlin_fairchild.bin +0 -0
- style_embeddings/exodus_styling.bin +0 -0
- style_embeddings/foorbyv2.bin +0 -0
- style_embeddings/learned_embeds_arcane.bin +1 -0
- style_embeddings/loaded_style_empire.bin +0 -0
- style_embeddings/useembeddingname-neg.bin +0 -0
- style_embeddings/useembeddingname.bin +0 -0
- style_guidence.py +176 -0
app.py
CHANGED
@@ -1,34 +1,48 @@
 import gradio as gr
 from torchvision import transforms
 import torch
 from main_inference import generate_mixed_image, generate_image
+from style_guidence import generate_with_prompt_style
 import matplotlib.colors as mcolors
 
-def run_generate_mixed_image(prompt1,prompt2,noised_image):
-    image = generate_mixed_image(prompt1,prompt2,noised_image)
+
+style_file_maps = {
+    '3D Female Cyborgs':"style_embeddings/3d_female_cyborgs.bin",
+    '80s Anime':"style_embeddings/80s_anime.bin",
+    'Anders Zorn':"style_embeddings/anders_zorn.bin",
+    "Angus Mcbride":"style_embeddings/angus_mcbride.bin",
+    "Breakcore":"style_embeddings/breakcore.bin",
+    "Brittney Williams":"style_embeddings/brittney_williams.bin",
+    "Bull vs Bear":"style_embeddings/bull_vs_bear.bin",
+    "Caitlin FairChild":"style_embeddings/caitlin_fairchild.bin",
+    "Exodus Styling":"style_embeddings/exodus_styling.bin",
+    "FoorByv2":"style_embeddings/foorbyv2.bin"
+}
+
+def run_generate_mixed_image(prompt1,prompt2,num_of_inf_steps,noised_image):
+    image = generate_mixed_image(prompt1,prompt2,num_of_inf_steps,noised_image)
     return image
 
-def run_generate_image(prompt1,noise_checkbox):
-    image = generate_image(prompt1,noised_image=noise_checkbox)
+def run_generate_image(prompt1,num_of_inf_steps,noise_checkbox):
+    image = generate_image(prompt1,num_inference_steps=num_of_inf_steps,noised_image=noise_checkbox)
     return image
 
-def run_generate_image_with_color_doninance(prompt1,color,color_loss_scale,noised_image_checkbox_1):
+def run_generate_image_with_color_doninance(prompt1,color,color_loss_scale,num_of_inf_steps,noised_image_checkbox_1):
     # Convert the hexadecimal color code to RGB values
     rgba_color = mcolors.hex2color(color)
     # Multiply the RGB values by 255 to get them in the [0, 255] range
     rgb_values = [int(val * 255) for val in rgba_color]
-    image = generate_image(prompt1,True,rgb_values,color_loss_scale,noised_image_checkbox_1)
+    image = generate_image(prompt1,num_of_inf_steps,True,rgb_values,color_loss_scale,noised_image_checkbox_1)
     return image
-
-def run_progress_video(prompt1):
-    video = progress_video(prompt1)
-    return video
+
+def run_generate_image_with_style(prompt,style,num_of_inf_steps):
+    output = generate_with_prompt_style(prompt, style_file_maps[style],num_of_inf_steps)
+    return output
 
 
 description_text_to_image = """ ### Text to Image Generation
 
-1. Write a Text Prompt.
+1. Write a Text Prompt and the number of inference steps; more steps give better results but take longer to run.
 
 2. Output will be an image based on the text prompt provided.
 
@@ -38,7 +52,7 @@ description_text_to_image = """ ### Text to Image Generation
 
 description_generate_mixed_image = """ ### Mix Image Generation
 
-1. Write Two Text Prompts.
+1. Write two Text Prompts and the number of inference steps; more steps give better results but take longer to run.
 
 2. The output will be an image that mixes both of the prompts provided.
 
@@ -48,7 +62,7 @@ description_generate_mixed_image = """ ### Mix Image Generation
 
 description_generate_image_with_color_dominance = """ ### Generate Images with color dominance
 
-1. Write a Text Prompt.
+1. Write a Text Prompt and the number of inference steps; more steps give better results but take longer to run.
 
 2. Select a color
 
@@ -60,11 +74,13 @@ description_generate_image_with_color_dominance = """ ### Generate Images with c
 
 """
 
-
-1. Write a Text Prompt.
-
-2.
+description_generate_prompt_with_style = """ ### Generate an image in the style of your choice
+
+1. Write a Text Prompt and the number of inference steps; more steps give better results but take longer to run.
+
+2. Select a style to apply to the image
+
+3. Get the Output
 
 """
 
@@ -75,18 +91,26 @@ image_input1 = gr.Image(type='pil')
 image_input2 = gr.Image(type='filepath')
 image_input3 = gr.Image(type='pil')
 image_input4 = gr.Image(type='pil')
+image_input5 = gr.Image(type='pil')
 text_input = gr.Text(label="Enter Text Prompt")
 text_input2 = gr.Text(label="Enter Text Prompt")
 text_input3 = gr.Text(label="Enter Text Prompt")
 text_input4 = gr.Text(label = "Enter Text Prompt")
 text_input5 = gr.Text(label = "Enter Text Prompt")
-
+
+num_of_inf_steps_slider1 = gr.inputs.Slider(minimum=0, maximum=50, default=30, step=1,label="Num of Inference Steps")
+num_of_inf_steps_slider2 = gr.inputs.Slider(minimum=0, maximum=50, default=30, step=1,label="Num of Inference Steps")
+num_of_inf_steps_slider3 = gr.inputs.Slider(minimum=0, maximum=50, default=30, step=1,label="Num of Inference Steps")
+num_of_inf_steps_slider4 = gr.inputs.Slider(minimum=0, maximum=50, default=30, step=1,label="Num of Inference Steps")
 
 color = gr.ColorPicker(label="Select a Color",description="Choose a color from the color picker:")
 noised_image_checkbox = gr.inputs.Checkbox(default=False, label="Show Noised Image")
 noised_image_checkbox_1 = gr.inputs.Checkbox(default=False, label="Show Noised Image")
 noised_image_checkbox_2 = gr.inputs.Checkbox(default=False, label="Show Noised Image")
+noised_image_checkbox_3 = gr.inputs.Checkbox(default=False, label="Show Noised Image")
 color_loss_scale = gr.inputs.Slider(minimum=0, maximum=255, default=40, step=1,label="Color Loss")
+style_options = ['3D Female Cyborgs', '80s Anime', 'Anders Zorn', "Angus Mcbride", "Breakcore", "Brittney Williams", "Bull vs Bear", "Caitlin FairChild", "Exodus Styling", "FoorByv2"]
+selected_style = gr.Dropdown(style_options,label="Select a Style to Follow",default="Anders Zorn")
 css = "h1 { text-align: center } .about { text-align: justify; padding-left: 10%; padding-right: 10%; }"
 
 with gr.Blocks(css=css, title='Play with Stable Diffusion') as demo:
@@ -100,6 +124,7 @@ with gr.Blocks(css=css, title='Play with Stable Diffusion') as demo:
         with gr.Row(variant="panel"):
             with gr.Column(scale=1):
                 text_input.render()
+                num_of_inf_steps_slider1.render()
                 noised_image_checkbox.render()
             with gr.Column(scale=1):
                 image_input1.render()
@@ -118,7 +143,7 @@ with gr.Blocks(css=css, title='Play with Stable Diffusion') as demo:
        #              examples_per_page=4)
 
        run_generate_image_button.click(run_generate_image,
-                                        inputs=[text_input,noised_image_checkbox],
+                                        inputs=[text_input,num_of_inf_steps_slider1,noised_image_checkbox],
                                         outputs=image_input1)
 
    with gr.Tab("Generate Image with Color Dominance"):
@@ -127,6 +152,7 @@ with gr.Blocks(css=css, title='Play with Stable Diffusion') as demo:
            with gr.Column(scale=1):
                 text_input4.render()
                 color_loss_scale.render()
+                num_of_inf_steps_slider2.render()
                 noised_image_checkbox_1.render()
                 color.render()
            with gr.Column(scale=1):
@@ -146,7 +172,7 @@ with gr.Blocks(css=css, title='Play with Stable Diffusion') as demo:
        #              examples_per_page=4)
 
        run_generate_image_with_color_doninance_button.click(run_generate_image_with_color_doninance,
-                                        inputs=[text_input4,color,color_loss_scale,noised_image_checkbox_1],
+                                        inputs=[text_input4,color,color_loss_scale,num_of_inf_steps_slider2,noised_image_checkbox_1],
                                         outputs=image_input3)
 
    ####################################################################################################################
@@ -156,6 +182,7 @@ with gr.Blocks(css=css, title='Play with Stable Diffusion') as demo:
            with gr.Column(scale=1):
                 text_input2.render()
                 text_input3.render()
+                num_of_inf_steps_slider3.render()
                 noised_image_checkbox_2.render()
            with gr.Column(scale=1):
                 image_input4.render()
@@ -175,18 +202,20 @@ with gr.Blocks(css=css, title='Play with Stable Diffusion') as demo:
        #              examples_per_page=4)
 
        run_generate_mixed_image_button.click(run_generate_mixed_image,
-                                        inputs=[text_input2,text_input3,noised_image_checkbox_2],
+                                        inputs=[text_input2,text_input3,num_of_inf_steps_slider3,noised_image_checkbox_2],
                                         outputs=image_input4)
 
    ####################################################################################################################
-    with gr.Tab("
+    with gr.Tab("Generate Image with Style"):
    # Images
        with gr.Row(variant="panel"):
            with gr.Column(scale=1):
                 text_input5.render()
+                num_of_inf_steps_slider4.render()
+                selected_style.render()
 
            with gr.Column(scale=1):
-
+                image_input5.render()
 
        # Submit & Clear
        with gr.Row():
@@ -194,18 +223,18 @@ with gr.Blocks(css=css, title='Play with Stable Diffusion') as demo:
        run_progress_video_button = gr.Button("progress_video", variant='primary')
        clear_btn_progress_video = gr.Button("Clear", variant="secondary")
 
-        gr.Markdown(
+        gr.Markdown(description_generate_prompt_with_style)
        # gr.Examples(examples = ["examples/12830823_87d2654e31.jpg", "examples/27782020_4dab210360.jpg", "examples/44129946_9eeb385d77.jpg"],
        #             inputs=[text_input5],
-        #             outputs=
+        #             outputs=image_input5,
        #             fn=run_process_video,
        #             examples_per_page=4)
 
-        run_progress_video_button.click(
+        run_progress_video_button.click(run_generate_image_with_style,
                                inputs=[
-                                    text_input5,
+                                    text_input5,selected_style,num_of_inf_steps_slider4
                                ],
-                                outputs=
+                                outputs=image_input5)
 
    #######################################################################################################################
    #######################################################################################################################
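For orientation, a minimal sketch of what the new style tab wires together: the dropdown label is resolved through style_file_maps to an embedding file, which generate_with_prompt_style then applies. The prompt, label, and step count below are illustrative inputs, not values from the commit.

    from style_guidence import generate_with_prompt_style

    # Subset of the app's style_file_maps; label and prompt are illustrative.
    style_file_maps = {'Anders Zorn': "style_embeddings/anders_zorn.bin"}

    image = generate_with_prompt_style(
        "A lighthouse at dusk",          # prompt typed into text_input5
        style_file_maps["Anders Zorn"],  # embedding path resolved from the dropdown label
        30,                              # num_of_inf_steps from the slider
    )
    image.save("styled_output.png")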
main_inference.py
CHANGED
@@ -10,8 +10,6 @@ from torchvision import transforms as tfms
 import shutil
 # For video display:
 import cv2
-from IPython.display import HTML
-from base64 import b64encode
 import os
 from utils import color_loss,latents_to_pil,pil_to_latent,sketch_loss
 # Set device
@@ -34,11 +32,11 @@ unet = unet.to(torch_device)
 
 scheduler.set_timesteps(15)
 
-def generate_mixed_image(prompt1, prompt2,noised_image=False):
+def generate_mixed_image(prompt1, prompt2,num_inference_steps=50,noised_image=False):
     mix_factor = 0.4 #@param
     height = 512 # default height of Stable Diffusion
     width = 512 # default width of Stable Diffusion
-    num_inference_steps =
+    num_inference_steps = num_inference_steps #@param # Number of denoising steps
     guidance_scale = 8 # Scale for classifier-free guidance
     generator = torch.manual_seed(32) # Seed generator to create the initial latent noise
     batch_size = 1
@@ -100,12 +98,12 @@ def generate_mixed_image(prompt1, prompt2,noised_image=False):
 
     return output
 
-def generate_image(prompt,color_postprocessing=False,postporcessing_color=None,color_loss_scale=40,noised_image=False):
+def generate_image(prompt,num_inference_steps=50,color_postprocessing=False,postporcessing_color=None,color_loss_scale=40,noised_image=False):
     #@title Store the predicted outputs and next frame for later viewing
     #prompt = 'A campfire (oil on canvas)' #
     height = 512 # default height of Stable Diffusion
     width = 512 # default width of Stable Diffusion
-    num_inference_steps =
+    num_inference_steps = num_inference_steps # Number of denoising steps
     guidance_scale = 8 # Scale for classifier-free guidance
     generator = torch.manual_seed(32) # Seed generator to create the initial latent noise
     batch_size = 1
@@ -210,26 +208,6 @@ def generate_image(prompt,color_postprocessing=False,postporcessing_color=None,c
 
     return output
 
-def progress_video(prompt):
-    pil_image = generate_image(prompt)
-    # Generate a list of image file paths (replace with your own logic)
-    num_frames = len(os.listdir("steps/"))
-    image_files = [f"steps/{i:04d}.jpeg" for i in range(1, num_frames + 1)]
-    # Read the first image to get its size (assuming all images have the same size)
-    first_image = cv2.imread(image_files[0])
-    height, width, _ = first_image.shape
-
-    # Define the output video writer
-    fourcc = cv2.VideoWriter_fourcc(*'mp4v') # Codec for MP4
-    out = cv2.VideoWriter('out.mp4', fourcc, 12, (width, height))
-
-    for image_file in image_files:
-        frame = cv2.imread(image_file)
-        out.write(frame)
-
-    out.release()
-    return "out.mp4"
-
 def generate_noised_version_of_image(pil_image):
     # View a noised version
     encoded = pil_to_latent(pil_image,vae)
@@ -239,6 +217,7 @@ def generate_noised_version_of_image(pil_image):
     return latents_to_pil(encoded_and_noised,vae)[0] # Display
 
 
+
 # if __name__ == "__main__":
 #     prompt = 'A campfire (oil on canvas)'
 #     color_loss_scale = 40
@@ -248,5 +227,3 @@ def generate_noised_version_of_image(pil_image):
 #     #pil_image = generate_noised_version_of_image(Image.open('output.png').resize((512, 512)))
 #     pil_image.save("output1.png")
 
-if __name__ == "__main__":
-    progress_video("lol")
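A quick usage sketch of the updated signatures; the prompts and values here are illustrative, with keyword and positional order following the new definitions above.

    from main_inference import generate_image, generate_mixed_image

    # Text-to-image with the new steps argument.
    img = generate_image("A campfire (oil on canvas)", num_inference_steps=30)

    # Color-dominance path; positionals follow the new definition:
    # (prompt, num_inference_steps, color_postprocessing, postporcessing_color, color_loss_scale, noised_image)
    img2 = generate_image("A campfire (oil on canvas)", 30, True, [200, 40, 40], 40, False)

    # Mixing two prompts.
    img3 = generate_mixed_image("A golden retriever", "A grizzly bear", 30, False)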
requirements.txt
CHANGED
@@ -1,8 +1,12 @@
-
-
-
-
-
-
-
-
+diffusers==0.21.4
+gradio==3.49.0
+ipython==8.15.0
+matplotlib==3.8.0
+numpy==1.26.1
+opencv_python==4.8.1.78
+Pillow==10.1.0
+Pillow==10.1.0
+torch==2.1.0
+torchvision==0.16.0
+tqdm==4.66.1
+transformers==4.25.1
style_embeddings/3d_female_cyborgs.bin
ADDED
Binary file (3.82 kB)

style_embeddings/80s_anime.bin
ADDED
Binary file (3.82 kB)

style_embeddings/anders_zorn.bin
ADDED
Binary file (3.82 kB)

style_embeddings/angus_mcbride.bin
ADDED
Binary file (3.82 kB)

style_embeddings/breakcore.bin
ADDED
Binary file (3.82 kB)

style_embeddings/brittney_williams.bin
ADDED
Binary file (3.82 kB)

style_embeddings/buhu_art_style.bin
ADDED
Binary file (4.86 kB)

style_embeddings/bull_vs_bear.bin
ADDED
Binary file (3.82 kB)

style_embeddings/caitlin_fairchild.bin
ADDED
Binary file (3.82 kB)

style_embeddings/exodus_styling.bin
ADDED
Binary file (3.82 kB)

style_embeddings/foorbyv2.bin
ADDED
Binary file (3.82 kB)

style_embeddings/learned_embeds_arcane.bin
ADDED
@@ -0,0 +1 @@
+(one line of GitHub blob-page JSON for gupta1912/ERA-V1-Assignments S20/learned_embeds_arcane.bin: the web page was saved in place of the raw binary, so this file contains page markup, not a valid embedding)
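Because that file holds page JSON rather than a serialized tensor, torch.load will fail on it. A small check like the following (a hypothetical helper, not part of the commit) could flag such files before the app offers them as styles:

    import torch

    def is_valid_embedding(path):
        # A learned embedding loads as a dict mapping a token name to a tensor;
        # an accidentally saved web page fails to unpickle or has the wrong type.
        try:
            obj = torch.load(path, map_location="cpu")
        except Exception:
            return False
        return isinstance(obj, dict) and all(torch.is_tensor(v) for v in obj.values())

    print(is_valid_embedding("style_embeddings/learned_embeds_arcane.bin"))  # False for this file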
style_embeddings/loaded_style_empire.bin
ADDED
Binary file (10.6 kB)

style_embeddings/useembeddingname-neg.bin
ADDED
Binary file (3.95 kB)

style_embeddings/useembeddingname.bin
ADDED
Binary file (3.95 kB)
style_guidence.py
ADDED
@@ -0,0 +1,176 @@
+import torch
+from transformers import CLIPTextModel, CLIPTokenizer
+from diffusers import AutoencoderKL, UNet2DConditionModel, LMSDiscreteScheduler
+from tqdm.auto import tqdm
+from torch import autocast
+from PIL import Image
+from matplotlib import pyplot as plt
+import numpy
+from torchvision import transforms as tfms
+import shutil
+# For video display:
+import cv2
+from IPython.display import HTML
+from base64 import b64encode
+import os
+from utils import color_loss,pil_to_latent,sketch_loss
+# Set device
+torch_device = "cpu"
+
+vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")
+
+# Load the tokenizer and text encoder to tokenize and encode the text.
+tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
+
+# The UNet model for generating the latents.
+unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")
+
+# The noise scheduler
+scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
+vae = vae.to(torch_device)
+text_encoder = text_encoder.to(torch_device)
+unet = unet.to(torch_device)
+
+scheduler.set_timesteps(15)
+
+token_emb_layer = text_encoder.text_model.embeddings.token_embedding
+pos_emb_layer = text_encoder.text_model.embeddings.position_embedding
+
+position_ids = text_encoder.text_model.embeddings.position_ids[:, :77]
+position_embeddings = pos_emb_layer(position_ids)
+
+
+def get_output_embeds(input_embeddings):
+    # CLIP's text model uses a causal mask, so we prepare it here:
+    bsz, seq_len = input_embeddings.shape[:2]
+    causal_attention_mask = text_encoder.text_model._build_causal_attention_mask(bsz, seq_len, dtype=input_embeddings.dtype)
+
+    # Getting the output embeddings involves calling the model with output_hidden_states=True
+    # so that it doesn't just return the pooled final predictions:
+    encoder_outputs = text_encoder.text_model.encoder(
+        inputs_embeds=input_embeddings,
+        attention_mask=None, # We aren't using an attention mask so that can be None
+        causal_attention_mask=causal_attention_mask.to(torch_device),
+        output_attentions=None,
+        output_hidden_states=True, # We want the output embs not the final output
+        return_dict=None,
+    )
+
+    # We're interested in the output hidden state only
+    output = encoder_outputs[0]
+
+    # There is a final layer norm we need to pass these through
+    output = text_encoder.text_model.final_layer_norm(output)
+
+    # And now they're ready!
+    return output
+
+def set_timesteps(scheduler, num_inference_steps):
+    scheduler.set_timesteps(num_inference_steps)
+    scheduler.timesteps = scheduler.timesteps.to(torch.float32)
+
+# def latents_to_pil(latents):
+#     # batch of latents -> list of images
+#     latents = (1 / 0.18215) * latents
+#     with torch.no_grad():
+#         image = vae.decode(latents).sample
+#     image = (image / 2 + 0.5).clamp(0, 1)
+#     image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
+#     images = (image * 255).round().astype("uint8")
+#     pil_images = [Image.fromarray(image) for image in images]
+#     return pil_images
+
+def latents_to_pil(latents):
+    # batch of latents -> list of images
+    latents = (1 / 0.18215) * latents
+    with torch.no_grad():
+        image = vae.decode(latents).sample
+    image = (image / 2 + 0.5).clamp(0, 1)
+    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
+    images = (image * 255).round().astype("uint8")
+    pil_images = [Image.fromarray(image) for image in images]
+    return pil_images
+
+
+def generate_with_embs(text_embeddings,text_input, seed,num_inference_steps):
+
+    height = 512 # default height of Stable Diffusion
+    width = 512 # default width of Stable Diffusion
+    num_inference_steps = num_inference_steps # Number of denoising steps
+    guidance_scale = 7.5 # Scale for classifier-free guidance
+    generator = torch.manual_seed(seed) # Seed generator to create the initial latent noise
+    batch_size = 1
+
+    max_length = text_input.input_ids.shape[-1]
+    uncond_input = tokenizer(
+        [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
+    )
+    with torch.no_grad():
+        uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
+    text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+
+    # Prep Scheduler
+    set_timesteps(scheduler, num_inference_steps)
+
+    # Prep latents
+    latents = torch.randn(
+        (batch_size, unet.in_channels, height // 8, width // 8),
+        generator=generator,
+    )
+    latents = latents.to(torch_device)
+    # latents = latents * scheduler.init_noise_sigma
+    latents = latents * scheduler.sigmas[0] # Need to scale to match k
+
+    # Loop
+    for i, t in tqdm(enumerate(scheduler.timesteps), total=len(scheduler.timesteps)):
+        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+        latent_model_input = torch.cat([latents] * 2)
+        sigma = scheduler.sigmas[i]
+        #latent_model_input = scheduler.scale_model_input(latent_model_input, t)
+        latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
+        # predict the noise residual
+        with torch.no_grad():
+            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]
+
+        # perform guidance
+        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+        # compute the previous noisy sample x_t -> x_t-1
+        #latents = scheduler.step(noise_pred, t, latents).prev_sample
+        latents = scheduler.step(noise_pred, i, latents)["prev_sample"]
+    return latents_to_pil(latents)[0]
+
+def generate_with_prompt_style(prompt, style, num_of_inf_steps=50,seed = 42):
+
+    prompt = prompt + ' in style of s'
+    embed = torch.load(style)
+    print("Keys",embed.keys())
+    text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
+    # for t in text_input['input_ids'][0][:20]: # We'll just look at the first few to save you from a wall of '<|endoftext|>'
+    #     print(t, tokenizer.decoder.get(int(t)))
+    input_ids = text_input.input_ids.to(torch_device)
+
+    token_embeddings = token_emb_layer(input_ids)
+    # The new embedding - our special style word
+    replacement_token_embedding = embed[list(embed.keys())[0]].to(torch_device)
+
+    # Insert this into the token embeddings (338 is the id the tokenizer gives the placeholder 's')
+    token_embeddings[0, torch.where(input_ids[0]==338)] = replacement_token_embedding.to(torch_device)
+
+    # Combine with pos embs
+    input_embeddings = token_embeddings + position_embeddings
+
+    # Feed through to get final output embs
+    modified_output_embeddings = get_output_embeds(input_embeddings)
+
+    # And generate an image with this:
+    return generate_with_embs(modified_output_embeddings, text_input, seed,num_of_inf_steps)
+
+
+# prompt = 'A man sipping wine wearing a spacesuit on the moon'
+# image = generate_with_prompt_style(prompt, '/home/deepanshudashora/Documents/Stable_Diffusion/caitlin_fairchild.bin')
+
+# image.save("output.png")
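To see what the token replacement in generate_with_prompt_style operates on, a small inspection sketch: it prints the embedding file's key and the token ids produced by the ' in style of s' suffix. The anders_zorn.bin path comes from this commit; the snippet itself is illustrative and not part of the repo.

    import torch
    from transformers import CLIPTokenizer

    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

    # A textual-inversion file maps one placeholder name to a learned vector.
    embed = torch.load("style_embeddings/anders_zorn.bin")
    for name, vec in embed.items():
        print(name, tuple(vec.shape))

    # generate_with_prompt_style swaps the vector in wherever id 338 appears;
    # this shows which ids the style suffix actually tokenizes to.
    ids = tokenizer(" in style of s").input_ids
    print(list(zip(ids, tokenizer.convert_ids_to_tokens(ids))))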