0xqtpie committed on
Commit
d8967d5
1 Parent(s): e4cacd9
Files changed (2)
  1. app.py +294 -0
  2. requirements.txt +21 -0
app.py ADDED
@@ -0,0 +1,294 @@
#!/usr/bin/env python

import os
import random

import gradio as gr
import numpy as np
import PIL.Image
import torch
import torchvision.transforms.functional as TF
from diffusers import (
    AutoencoderKL,
    EulerAncestralDiscreteScheduler,
    StableDiffusionXLAdapterPipeline,
    T2IAdapter,
)

from modelscope.pipelines import pipeline
from modelscope.outputs import OutputKeys
DESCRIPTION = '''# doodle2vid
Combining T2I-Adapter-SDXL with MS-Image2Video to create a doodle-to-video pipeline.
Shout-out to [fffiloni](https://huggingface.co/fffiloni) & [ARC Lab, Tencent PCG](https://huggingface.co/TencentARC) 🗣️

How to use: draw a doodle on the canvas and click "Run" to generate a video.
You can also provide a prompt with more details and choose a style.
'''

if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
style_list = [
    {
        "name": "(No style)",
        "prompt": "{prompt}",
        "negative_prompt": "",
    },
    {
        "name": "Cinematic",
        "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
        "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
    },
    {
        "name": "3D Model",
        "prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
        "negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
    },
    {
        "name": "Anime",
        "prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed",
        "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
    },
    {
        "name": "Digital Art",
        "prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
        "negative_prompt": "photo, photorealistic, realism, ugly",
    },
    {
        "name": "Photographic",
        "prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
        "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
    },
    {
        "name": "Pixel art",
        "prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
        "negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
    },
    {
        "name": "Fantasy art",
        "prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
        "negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white",
    },
    {
        "name": "Neonpunk",
        "prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
        "negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
    },
    {
        "name": "Manga",
        "prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
        "negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
    },
]

styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
STYLE_NAMES = list(styles.keys())
DEFAULT_STYLE_NAME = "(No style)"


def apply_style(style_name: str, positive: str, negative: str = "") -> tuple[str, str]:
    p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
    return p.replace("{prompt}", positive), n + negative
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Stage 1: sketch-conditioned SDXL image generation via T2I-Adapter.
if torch.cuda.is_available():
    model_id = "stabilityai/stable-diffusion-xl-base-1.0"
    adapter = T2IAdapter.from_pretrained(
        "TencentARC/t2i-adapter-sketch-sdxl-1.0", torch_dtype=torch.float16, variant="fp16"
    )
    scheduler = EulerAncestralDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
    pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
        model_id,
        vae=AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16),
        adapter=adapter,
        scheduler=scheduler,
        torch_dtype=torch.float16,
        variant="fp16",
    )
    pipe.to(device)
else:
    pipe = None

MAX_SEED = np.iinfo(np.int32).max

# Stage 2: ModelScope MS-Image2Video pipeline (single image -> short video clip).
video_pipe = pipeline(task='image-to-video', model='damo/Image-to-Video', model_revision='v1.1.0')


def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    return seed

def inferVideo(image: PIL.Image.Image) -> str:
    # result_image is a gr.Image without type="pil", so Gradio hands this function a
    # numpy array; convert it back to a PIL image before saving.
    if isinstance(image, np.ndarray):
        image = PIL.Image.fromarray(image)

    # Save the frame to a temp file for the ModelScope image-to-video pipeline.
    temp_path = "temp_input_image.png"
    image.save(temp_path)

    output_video_path = video_pipe(temp_path, output_video='output.mp4')[OutputKeys.OUTPUT_VIDEO]
    print(output_video_path)
    return output_video_path

def inferImage(
    image: PIL.Image.Image,
    prompt: str,
    negative_prompt: str,
    style_name: str = DEFAULT_STYLE_NAME,
    num_steps: int = 25,
    guidance_scale: float = 5,
    adapter_conditioning_scale: float = 0.8,
    adapter_conditioning_factor: float = 0.8,
    seed: int = 0,
    progress=gr.Progress(track_tqdm=True),
) -> PIL.Image.Image:
    # Binarize the doodle so the sketch adapter receives a clean black-and-white edge map.
    image = image.convert("RGB")
    image = TF.to_tensor(image) > 0.5
    image = TF.to_pil_image(image.to(torch.float32))

    prompt, negative_prompt = apply_style(style_name, prompt, negative_prompt)

    generator = torch.Generator(device=device).manual_seed(seed)
    out = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image=image,
        num_inference_steps=num_steps,
        generator=generator,
        guidance_scale=guidance_scale,
        adapter_conditioning_scale=adapter_conditioning_scale,
        adapter_conditioning_factor=adapter_conditioning_factor,
    ).images[0]

    return out

with gr.Blocks(css="style.css") as demo:
    gr.Markdown(DESCRIPTION, elem_id="description")

    with gr.Row():
        with gr.Column():
            with gr.Group():
                image = gr.Image(
                    source="canvas",
                    tool="sketch",
                    type="pil",
                    image_mode="L",
                    invert_colors=True,
                    shape=(1024, 1024),
                    brush_radius=4,
                    height=440,
                )
                prompt = gr.Textbox(label="Prompt")
                style = gr.Dropdown(label="Style", choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME)
                run_button = gr.Button("Run")
            with gr.Accordion("Advanced options", open=False):
                negative_prompt = gr.Textbox(
                    label="Negative prompt",
                    value=" extra digit, fewer digits, cropped, worst quality, low quality, glitch, deformed, mutated, ugly, disfigured",
                )
                num_steps = gr.Slider(
                    label="Number of steps",
                    minimum=1,
                    maximum=50,
                    step=1,
                    value=25,
                )
                guidance_scale = gr.Slider(
                    label="Guidance scale",
                    minimum=0.1,
                    maximum=10.0,
                    step=0.1,
                    value=5,
                )
                adapter_conditioning_scale = gr.Slider(
                    label="Adapter conditioning scale",
                    minimum=0.5,
                    maximum=1,
                    step=0.1,
                    value=0.8,
                )
                adapter_conditioning_factor = gr.Slider(
                    label="Adapter conditioning factor",
                    info="Fraction of timesteps for which adapter should be applied",
                    minimum=0.5,
                    maximum=1,
                    step=0.1,
                    value=0.8,
                )
                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=MAX_SEED,
                    step=1,
                    value=0,
                )
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
        with gr.Column():
            result_image = gr.Image(label="Intermediate Image Output", height=400)
            result_video = gr.Video(label="Final Video Output", height=400)

    inputs = [
        image,
        prompt,
        negative_prompt,
        style,
        num_steps,
        guidance_scale,
        adapter_conditioning_scale,
        adapter_conditioning_factor,
        seed,
    ]
    # Each trigger runs the same chain: randomize the seed, then sketch-to-image, then image-to-video.
    prompt.submit(
        fn=randomize_seed_fn,
        inputs=[seed, randomize_seed],
        outputs=seed,
        queue=False,
        api_name=False,
    ).then(
        fn=inferImage,
        inputs=inputs,
        outputs=result_image,
        api_name=False,
    ).then(
        fn=inferVideo,
        inputs=result_image,
        outputs=result_video,
        api_name=False,
    )
    negative_prompt.submit(
        fn=randomize_seed_fn,
        inputs=[seed, randomize_seed],
        outputs=seed,
        queue=False,
        api_name=False,
    ).then(
        fn=inferImage,
        inputs=inputs,
        outputs=result_image,
        api_name=False,
    ).then(
        fn=inferVideo,
        inputs=result_image,
        outputs=result_video,
        api_name=False,
    )
    run_button.click(
        fn=randomize_seed_fn,
        inputs=[seed, randomize_seed],
        outputs=seed,
        queue=False,
        api_name=False,
    ).then(
        fn=inferImage,
        inputs=inputs,
        outputs=result_image,
        api_name=False,
    ).then(
        fn=inferVideo,
        inputs=result_image,
        outputs=result_video,
        api_name=False,
    )

if __name__ == "__main__":
    demo.queue(max_size=20).launch()
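For reference, here is a minimal standalone sketch of the same two-stage flow implemented in app.py, without the Gradio UI: the doodle is binarized and passed to the sketch-conditioned SDXL adapter pipeline, and the resulting frame is handed to the ModelScope image-to-video pipeline. The input path doodle.png, the example prompt, and the output file names are placeholders, and a CUDA GPU is assumed, as in the demo itself.

#!/usr/bin/env python
# Standalone sketch of the doodle -> image -> video flow from app.py.
# Assumes a CUDA GPU; "doodle.png" and the prompt below are placeholders.
import PIL.Image
import torch
import torchvision.transforms.functional as TF
from diffusers import (
    AutoencoderKL,
    EulerAncestralDiscreteScheduler,
    StableDiffusionXLAdapterPipeline,
    T2IAdapter,
)
from modelscope.pipelines import pipeline
from modelscope.outputs import OutputKeys

model_id = "stabilityai/stable-diffusion-xl-base-1.0"
adapter = T2IAdapter.from_pretrained(
    "TencentARC/t2i-adapter-sketch-sdxl-1.0", torch_dtype=torch.float16, variant="fp16"
)
pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
    model_id,
    vae=AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16),
    adapter=adapter,
    scheduler=EulerAncestralDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler"),
    torch_dtype=torch.float16,
    variant="fp16",
).to("cuda")

# Stage 1: binarize the doodle (white strokes on black) and run sketch-conditioned SDXL.
sketch = PIL.Image.open("doodle.png").convert("RGB").resize((1024, 1024))
sketch = TF.to_pil_image((TF.to_tensor(sketch) > 0.5).to(torch.float32))
frame = pipe(
    prompt="a cozy cabin in a snowy forest, warm light in the windows",
    negative_prompt="extra digit, fewer digits, cropped, worst quality, low quality",
    image=sketch,
    num_inference_steps=25,
    guidance_scale=5,
    adapter_conditioning_scale=0.8,
    adapter_conditioning_factor=0.8,
    generator=torch.Generator("cuda").manual_seed(0),
).images[0]
frame.save("intermediate.png")

# Stage 2: animate the generated frame with the ModelScope MS-Image2Video pipeline.
video_pipe = pipeline(task="image-to-video", model="damo/Image-to-Video", model_revision="v1.1.0")
print(video_pipe("intermediate.png", output_video="output.mp4")[OutputKeys.OUTPUT_VIDEO])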
requirements.txt ADDED
@@ -0,0 +1,21 @@
accelerate==0.22.0
git+https://github.com/huggingface/diffusers@t2i-adapter-load-lora
gradio==3.43.1
Pillow==10.0.0
safetensors==0.3.3
torch==2.0.1
torchvision==0.15.2
transformers==4.33.1
xformers==0.0.20
modelscope==1.8.4
open_clip_torch>=2.0.2
opencv-python-headless
opencv-python
einops>=0.4
rotary-embedding-torch
fairscale
scipy
imageio
pytorch-lightning
torchsde
easydict
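As a quick way to confirm the pinned environment above resolves, here is a small sanity-check sketch; it is not part of the commit and only imports packages pulled in by the requirements list.

# Environment sanity check for the dependencies pinned above (not part of the commit).
from importlib.metadata import version

import torch

for pkg in ("diffusers", "transformers", "gradio", "modelscope", "torch", "torchvision"):
    print(f"{pkg}: {version(pkg)}")
print("CUDA available:", torch.cuda.is_available())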