stable-video-diffusion

Runtime error

App Files Files Community

pcuenq HF staff committed on Nov 24, 2023

Commit

0cd72ee

•

1 Parent(s): d41d75b

resource-consumption (#3)

Browse files

- Remove caches and sample videos (a69cc9b946ad8f287bfd5408d81fff425ae392af)
- Ignore __pycache__ (5ec1f00964ee885250b942fe810cb1142f40514d)
- Reduce memory by ~1.7 GB, without affecting speed too much. (b8850be073dd7bf6f594579e4cd433b31383c771)
- Improve error reporting (7865d267ab23aeb9fd8dc00ba6bce2f679f2f865)
- Use PIL instead of path for image component. (4ebb8b5bbd3b8d5a32edb65fa63a43462252ccb6)

Files changed (43) hide show

.gitignore +1 -0
app.py +103 -133
outputs/000000.mp4 +0 -0
outputs/000001.mp4 +0 -0
outputs/000002.mp4 +0 -0
outputs/000003.mp4 +0 -0
outputs/000004.mp4 +0 -3
outputs/000005.mp4 +0 -0
outputs/simple_video_sample/svd_xt/000000.mp4 +0 -0
scripts/__pycache__/__init__.cpython-310.pyc +0 -0
scripts/util/__pycache__/__init__.cpython-310.pyc +0 -0
scripts/util/detection/__pycache__/__init__.cpython-310.pyc +0 -0
scripts/util/detection/__pycache__/nsfw_and_watermark_dectection.cpython-310.pyc +0 -0
sgm/__pycache__/__init__.cpython-310.pyc +0 -0
sgm/__pycache__/util.cpython-310.pyc +0 -0
sgm/inference/__pycache__/helpers.cpython-310.pyc +0 -0
sgm/models/__pycache__/__init__.cpython-310.pyc +0 -0
sgm/models/__pycache__/autoencoder.cpython-310.pyc +0 -0
sgm/models/__pycache__/diffusion.cpython-310.pyc +0 -0
sgm/modules/__pycache__/__init__.cpython-310.pyc +0 -0
sgm/modules/__pycache__/attention.cpython-310.pyc +0 -0
sgm/modules/__pycache__/ema.cpython-310.pyc +0 -0
sgm/modules/__pycache__/video_attention.cpython-310.pyc +0 -0
sgm/modules/autoencoding/__pycache__/__init__.cpython-310.pyc +0 -0
sgm/modules/autoencoding/__pycache__/temporal_ae.cpython-310.pyc +0 -0
sgm/modules/autoencoding/regularizers/__pycache__/__init__.cpython-310.pyc +0 -0
sgm/modules/autoencoding/regularizers/__pycache__/base.cpython-310.pyc +0 -0
sgm/modules/diffusionmodules/__pycache__/__init__.cpython-310.pyc +0 -0
sgm/modules/diffusionmodules/__pycache__/denoiser.cpython-310.pyc +0 -0
sgm/modules/diffusionmodules/__pycache__/denoiser_scaling.cpython-310.pyc +0 -0
sgm/modules/diffusionmodules/__pycache__/discretizer.cpython-310.pyc +0 -0
sgm/modules/diffusionmodules/__pycache__/guiders.cpython-310.pyc +0 -0
sgm/modules/diffusionmodules/__pycache__/model.cpython-310.pyc +0 -0
sgm/modules/diffusionmodules/__pycache__/openaimodel.cpython-310.pyc +0 -0
sgm/modules/diffusionmodules/__pycache__/sampling.cpython-310.pyc +0 -0
sgm/modules/diffusionmodules/__pycache__/sampling_utils.cpython-310.pyc +0 -0
sgm/modules/diffusionmodules/__pycache__/util.cpython-310.pyc +0 -0
sgm/modules/diffusionmodules/__pycache__/video_model.cpython-310.pyc +0 -0
sgm/modules/diffusionmodules/__pycache__/wrappers.cpython-310.pyc +0 -0
sgm/modules/distributions/__pycache__/__init__.cpython-310.pyc +0 -0
sgm/modules/distributions/__pycache__/distributions.cpython-310.pyc +0 -0
sgm/modules/encoders/__pycache__/__init__.cpython-310.pyc +0 -0
sgm/modules/encoders/__pycache__/modules.cpython-310.pyc +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__/

app.py CHANGED Viewed

@@ -69,158 +69,130 @@ model, filter = load_model(
 )
 def sample(
-    input_path: str = "assets/test_image.png",  # Can either be image file or folder with image files
     seed: Optional[int] = None,
     randomize_seed: bool = True,
     motion_bucket_id: int = 127,
     fps_id: int = 6,
     version: str = "svd_xt",
     cond_aug: float = 0.02,
-    decoding_t: int = 7,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
     device: str = "cuda",
     output_folder: str = "outputs",
     progress=gr.Progress(track_tqdm=True)
 ):
-    """
-    Simple script to generate a single sample conditioned on an image `input_path` or multiple images, one for each
-    image file in folder `input_path`. If you run out of VRAM, try decreasing `decoding_t`.
-    """
     if(randomize_seed):
         seed = random.randint(0, max_64_bit_int)
     torch.manual_seed(seed)
-    path = Path(input_path)
-    all_img_paths = []
-    if path.is_file():
-        if any([input_path.endswith(x) for x in ["jpg", "jpeg", "png"]]):
-            all_img_paths = [input_path]
-        else:
-            raise ValueError("Path is not valid image file.")
-    elif path.is_dir():
-        all_img_paths = sorted(
-            [
-                f
-                for f in path.iterdir()
-                if f.is_file() and f.suffix.lower() in [".jpg", ".jpeg", ".png"]
-            ]
         )
-        if len(all_img_paths) == 0:
-            raise ValueError("Folder does not contain any images.")
-    else:
-        raise ValueError
-    for input_img_path in all_img_paths:
-        with Image.open(input_img_path) as image:
-            if image.mode == "RGBA":
-                image = image.convert("RGB")
-            w, h = image.size
-            if h % 64 != 0 or w % 64 != 0:
-                width, height = map(lambda x: x - x % 64, (w, h))
-                image = image.resize((width, height))
-                print(
-                    f"WARNING: Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
-                )
-            image = ToTensor()(image)
-            image = image * 2.0 - 1.0
-        image = image.unsqueeze(0).to(device)
-        H, W = image.shape[2:]
-        assert image.shape[1] == 3
-        F = 8
-        C = 4
-        shape = (num_frames, C, H // F, W // F)
-        if (H, W) != (576, 1024):
-            print(
-                "WARNING: The conditioning frame you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
             )
-        if motion_bucket_id > 255:
-            print(
-                "WARNING: High motion bucket! This may lead to suboptimal performance."
             )
-        if fps_id < 5:
-            print("WARNING: Small fps value! This may lead to suboptimal performance.")
-        if fps_id > 30:
-            print("WARNING: Large fps value! This may lead to suboptimal performance.")
-        value_dict = {}
-        value_dict["motion_bucket_id"] = motion_bucket_id
-        value_dict["fps_id"] = fps_id
-        value_dict["cond_aug"] = cond_aug
-        value_dict["cond_frames_without_noise"] = image
-        value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
-        value_dict["cond_aug"] = cond_aug
-        with torch.no_grad():
-            with torch.autocast(device):
-                batch, batch_uc = get_batch(
-                    get_unique_embedder_keys_from_conditioner(model.conditioner),
-                    value_dict,
-                    [1, num_frames],
-                    T=num_frames,
-                    device=device,
-                )
-                c, uc = model.conditioner.get_unconditional_conditioning(
-                    batch,
-                    batch_uc=batch_uc,
-                    force_uc_zero_embeddings=[
-                        "cond_frames",
-                        "cond_frames_without_noise",
-                    ],
-                )
-                for k in ["crossattn", "concat"]:
-                    uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
-                    uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
-                    c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
-                    c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
-                randn = torch.randn(shape, device=device)
-                additional_model_inputs = {}
-                additional_model_inputs["image_only_indicator"] = torch.zeros(
-                    2, num_frames
-                ).to(device)
-                additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
-                def denoiser(input, sigma, c):
-                    return model.denoiser(
-                        model.model, input, sigma, c, **additional_model_inputs
-                    )
-                samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
-                model.en_and_decode_n_samples_a_time = decoding_t
-                samples_x = model.decode_first_stage(samples_z)
-                samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
-                os.makedirs(output_folder, exist_ok=True)
-                base_count = len(glob(os.path.join(output_folder, "*.mp4")))
-                video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
-                writer = cv2.VideoWriter(
-                    video_path,
-                    cv2.VideoWriter_fourcc(*"mp4v"),
-                    fps_id + 1,
-                    (samples.shape[-1], samples.shape[-2]),
-                )
-                samples = embed_watermark(samples)
-                samples = filter(samples)
-                vid = (
-                    (rearrange(samples, "t c h w -> t h w c") * 255)
-                    .cpu()
-                    .numpy()
-                    .astype(np.uint8)
                 )
-                for frame in vid:
-                    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-                    writer.write(frame)
-                writer.release()
-        return video_path, seed
 def get_unique_embedder_keys_from_conditioner(conditioner):
     return list(set([x.input_key for x in conditioner.embedders]))
@@ -266,8 +238,7 @@ def get_batch(keys, value_dict, N, T, device):
             batch_uc[key] = torch.clone(batch[key])
     return batch, batch_uc
-def resize_image(image_path, output_size=(1024, 576)):
-    image = Image.open(image_path)
     # Calculate aspect ratios
     target_aspect = output_size[0] / output_size[1]  # Aspect ratio of the desired size
     image_aspect = image.width / image.height  # Aspect ratio of the original image
@@ -296,7 +267,6 @@ def resize_image(image_path, output_size=(1024, 576)):
     # Crop the image
     cropped_image = resized_image.crop((left, top, right, bottom))
     return cropped_image
 with gr.Blocks() as demo:
@@ -305,7 +275,7 @@ with gr.Blocks() as demo:
   ''')
   with gr.Row():
     with gr.Column():
-        image = gr.Image(label="Upload your image", type="filepath")
         generate_btn = gr.Button("Generate")
     video = gr.Video()
   with gr.Accordion("Advanced options", open=False):

 )
 def sample(
+    image: Image,
     seed: Optional[int] = None,
     randomize_seed: bool = True,
     motion_bucket_id: int = 127,
     fps_id: int = 6,
     version: str = "svd_xt",
     cond_aug: float = 0.02,
+    decoding_t: int = 5,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
     device: str = "cuda",
     output_folder: str = "outputs",
     progress=gr.Progress(track_tqdm=True)
 ):
     if(randomize_seed):
         seed = random.randint(0, max_64_bit_int)
     torch.manual_seed(seed)
+    if image.mode == "RGBA":
+        image = image.convert("RGB")
+    w, h = image.size
+    if h % 64 != 0 or w % 64 != 0:
+        width, height = map(lambda x: x - x % 64, (w, h))
+        image = image.resize((width, height))
+        print(
+            f"WARNING: Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
+        )
+    image = ToTensor()(image)
+    image = image * 2.0 - 1.0
+    image = image.unsqueeze(0).to(device)
+    H, W = image.shape[2:]
+    assert image.shape[1] == 3
+    F = 8
+    C = 4
+    shape = (num_frames, C, H // F, W // F)
+    if (H, W) != (576, 1024):
+        print(
+            "WARNING: The conditioning frame you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
+        )
+    if motion_bucket_id > 255:
+        print(
+            "WARNING: High motion bucket! This may lead to suboptimal performance."
         )
+    if fps_id < 5:
+        print("WARNING: Small fps value! This may lead to suboptimal performance.")
+    if fps_id > 30:
+        print("WARNING: Large fps value! This may lead to suboptimal performance.")
+    value_dict = {}
+    value_dict["motion_bucket_id"] = motion_bucket_id
+    value_dict["fps_id"] = fps_id
+    value_dict["cond_aug"] = cond_aug
+    value_dict["cond_frames_without_noise"] = image
+    value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
+    value_dict["cond_aug"] = cond_aug
+    with torch.no_grad():
+        with torch.autocast(device):
+            batch, batch_uc = get_batch(
+                get_unique_embedder_keys_from_conditioner(model.conditioner),
+                value_dict,
+                [1, num_frames],
+                T=num_frames,
+                device=device,
             )
+            c, uc = model.conditioner.get_unconditional_conditioning(
+                batch,
+                batch_uc=batch_uc,
+                force_uc_zero_embeddings=[
+                    "cond_frames",
+                    "cond_frames_without_noise",
+                ],
             )
+            for k in ["crossattn", "concat"]:
+                uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
+                uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
+                c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
+                c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
+            randn = torch.randn(shape, device=device)
+            additional_model_inputs = {}
+            additional_model_inputs["image_only_indicator"] = torch.zeros(
+                2, num_frames
+            ).to(device)
+            additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
+            def denoiser(input, sigma, c):
+                return model.denoiser(
+                    model.model, input, sigma, c, **additional_model_inputs
                 )
+            samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
+            model.en_and_decode_n_samples_a_time = decoding_t
+            samples_x = model.decode_first_stage(samples_z)
+            samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
+            os.makedirs(output_folder, exist_ok=True)
+            base_count = len(glob(os.path.join(output_folder, "*.mp4")))
+            video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
+            writer = cv2.VideoWriter(
+                video_path,
+                cv2.VideoWriter_fourcc(*"mp4v"),
+                fps_id + 1,
+                (samples.shape[-1], samples.shape[-2]),
+            )
+            samples = embed_watermark(samples)
+            samples = filter(samples)
+            vid = (
+                (rearrange(samples, "t c h w -> t h w c") * 255)
+                .cpu()
+                .numpy()
+                .astype(np.uint8)
+            )
+            for frame in vid:
+                frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+                writer.write(frame)
+            writer.release()
+    return video_path, seed
 def get_unique_embedder_keys_from_conditioner(conditioner):
     return list(set([x.input_key for x in conditioner.embedders]))
             batch_uc[key] = torch.clone(batch[key])
     return batch, batch_uc
+def resize_image(image, output_size=(1024, 576)):
     # Calculate aspect ratios
     target_aspect = output_size[0] / output_size[1]  # Aspect ratio of the desired size
     image_aspect = image.width / image.height  # Aspect ratio of the original image
     # Crop the image
     cropped_image = resized_image.crop((left, top, right, bottom))
     return cropped_image
 with gr.Blocks() as demo:
   ''')
   with gr.Row():
     with gr.Column():
+        image = gr.Image(label="Upload your image", type="pil")
         generate_btn = gr.Button("Generate")
     video = gr.Video()
   with gr.Accordion("Advanced options", open=False):

outputs/000000.mp4 DELETED Viewed

Binary file (297 kB)

outputs/000001.mp4 DELETED Viewed

Binary file (297 kB)

outputs/000002.mp4 DELETED Viewed

Binary file (255 kB)

outputs/000003.mp4 DELETED Viewed

Binary file (288 kB)

outputs/000004.mp4 DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c2cc34e39dd8c5d2022de56d1d83936ac2b7a286ab0351895f1b83e00a9e2fa7
-size 1574414

outputs/000005.mp4 DELETED Viewed

Binary file (265 kB)

outputs/simple_video_sample/svd_xt/000000.mp4 DELETED Viewed

Binary file (298 kB)

scripts/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (154 Bytes)

scripts/util/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (159 Bytes)

scripts/util/detection/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (169 Bytes)

scripts/util/detection/__pycache__/nsfw_and_watermark_dectection.cpython-310.pyc DELETED Viewed

Binary file (3.9 kB)

sgm/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (330 Bytes)

sgm/__pycache__/util.cpython-310.pyc DELETED Viewed

Binary file (9.45 kB)

sgm/inference/__pycache__/helpers.cpython-310.pyc DELETED Viewed

Binary file (8.87 kB)

sgm/models/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (260 Bytes)

sgm/models/__pycache__/autoencoder.cpython-310.pyc DELETED Viewed

Binary file (19.2 kB)

sgm/models/__pycache__/diffusion.cpython-310.pyc DELETED Viewed

Binary file (10.9 kB)

sgm/modules/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (321 Bytes)

sgm/modules/__pycache__/attention.cpython-310.pyc DELETED Viewed

Binary file (18 kB)

sgm/modules/__pycache__/ema.cpython-310.pyc DELETED Viewed

Binary file (3.22 kB)

sgm/modules/__pycache__/video_attention.cpython-310.pyc DELETED Viewed

Binary file (6.27 kB)

sgm/modules/autoencoding/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (171 Bytes)

sgm/modules/autoencoding/__pycache__/temporal_ae.cpython-310.pyc DELETED Viewed

Binary file (8.48 kB)

sgm/modules/autoencoding/regularizers/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (1.5 kB)

sgm/modules/autoencoding/regularizers/__pycache__/base.cpython-310.pyc DELETED Viewed

Binary file (2.04 kB)

sgm/modules/diffusionmodules/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (175 Bytes)

sgm/modules/diffusionmodules/__pycache__/denoiser.cpython-310.pyc DELETED Viewed

Binary file (3.09 kB)

sgm/modules/diffusionmodules/__pycache__/denoiser_scaling.cpython-310.pyc DELETED Viewed

Binary file (2.45 kB)

sgm/modules/diffusionmodules/__pycache__/discretizer.cpython-310.pyc DELETED Viewed

Binary file (3 kB)

sgm/modules/diffusionmodules/__pycache__/guiders.cpython-310.pyc DELETED Viewed

Binary file (3.96 kB)

sgm/modules/diffusionmodules/__pycache__/model.cpython-310.pyc DELETED Viewed

Binary file (16.5 kB)

sgm/modules/diffusionmodules/__pycache__/openaimodel.cpython-310.pyc DELETED Viewed

Binary file (21.7 kB)

sgm/modules/diffusionmodules/__pycache__/sampling.cpython-310.pyc DELETED Viewed

Binary file (11.8 kB)

sgm/modules/diffusionmodules/__pycache__/sampling_utils.cpython-310.pyc DELETED Viewed

Binary file (1.53 kB)

sgm/modules/diffusionmodules/__pycache__/util.cpython-310.pyc DELETED Viewed

Binary file (11.7 kB)

sgm/modules/diffusionmodules/__pycache__/video_model.cpython-310.pyc DELETED Viewed

Binary file (8.21 kB)

sgm/modules/diffusionmodules/__pycache__/wrappers.cpython-310.pyc DELETED Viewed

Binary file (1.69 kB)

sgm/modules/distributions/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (172 Bytes)

sgm/modules/distributions/__pycache__/distributions.cpython-310.pyc DELETED Viewed

Binary file (3.77 kB)

sgm/modules/encoders/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (167 Bytes)

sgm/modules/encoders/__pycache__/modules.cpython-310.pyc DELETED Viewed

Binary file (29.5 kB)