multimodalart committed
Commit aeacc98 • 1 Parent(s): a3595a3

Update app.py

Files changed (1)
  1. app.py +127 -106
app.py CHANGED
@@ -75,116 +75,137 @@ def sample(
 ):
     output_folder = str(uuid.uuid4())
     torch.manual_seed(seed)
-
-    all_img_paths = [image]
+    path = Path(input_path)
+    all_img_paths = []
+    if path.is_file():
+        if any([input_path.endswith(x) for x in ["jpg", "jpeg", "png"]]):
+            all_img_paths = [input_path]
+        else:
+            raise ValueError("Path is not valid image file.")
+    elif path.is_dir():
+        all_img_paths = sorted(
+            [
+                f
+                for f in path.iterdir()
+                if f.is_file() and f.suffix.lower() in [".jpg", ".jpeg", ".png"]
+            ]
+        )
+        if len(all_img_paths) == 0:
+            raise ValueError("Folder does not contain any images.")
+    else:
+        raise ValueError
+
     for input_img_path in all_img_paths:
-        if image.mode == "RGBA":
-            image = image.convert("RGB")
-        w, h = image.size
-
-        if h % 64 != 0 or w % 64 != 0:
-            width, height = map(lambda x: x - x % 64, (w, h))
-            image = image.resize((width, height))
-            print(
-                f"WARNING: Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
-            )
-
-        image = ToTensor()(image)
-        image = image * 2.0 - 1.0
-
-        image = image.unsqueeze(0).to(device)
-        H, W = image.shape[2:]
-        assert image.shape[1] == 3
-        F = 8
-        C = 4
-        shape = (num_frames, C, H // F, W // F)
-        if (H, W) != (576, 1024):
-            print(
-                "WARNING: The conditioning frame you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
-            )
-        if motion_bucket_id > 255:
-            print(
-                "WARNING: High motion bucket! This may lead to suboptimal performance."
-            )
-
-        if fps_id < 5:
-            print("WARNING: Small fps value! This may lead to suboptimal performance.")
-
-        if fps_id > 30:
-            print("WARNING: Large fps value! This may lead to suboptimal performance.")
-
-        value_dict = {}
-        value_dict["motion_bucket_id"] = motion_bucket_id
-        value_dict["fps_id"] = fps_id
-        value_dict["cond_aug"] = cond_aug
-        value_dict["cond_frames_without_noise"] = image
-        value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
-        value_dict["cond_aug"] = cond_aug
-
-        with torch.no_grad():
-            with torch.autocast(device):
-                batch, batch_uc = get_batch(
-                    get_unique_embedder_keys_from_conditioner(model.conditioner),
-                    value_dict,
-                    [1, num_frames],
-                    T=num_frames,
-                    device=device,
-                )
-                c, uc = model.conditioner.get_unconditional_conditioning(
-                    batch,
-                    batch_uc=batch_uc,
-                    force_uc_zero_embeddings=[
-                        "cond_frames",
-                        "cond_frames_without_noise",
-                    ],
-                )
-
-                for k in ["crossattn", "concat"]:
-                    uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
-                    uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
-                    c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
-                    c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
-
-                randn = torch.randn(shape, device=device)
-
-                additional_model_inputs = {}
-                additional_model_inputs["image_only_indicator"] = torch.zeros(
-                    2, num_frames
-                ).to(device)
-                additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
-
-                def denoiser(input, sigma, c):
-                    return model.denoiser(
-                        model.model, input, sigma, c, **additional_model_inputs
-                    )
-
-                samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
-                model.en_and_decode_n_samples_a_time = decoding_t
-                samples_x = model.decode_first_stage(samples_z)
-                samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
-
-                os.makedirs(output_folder, exist_ok=True)
-                base_count = len(glob(os.path.join(output_folder, "*.mp4")))
-                video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
-                writer = cv2.VideoWriter(
-                    video_path,
-                    cv2.VideoWriter_fourcc(*'mp4v'),
-                    fps_id + 1,
-                    (samples.shape[-1], samples.shape[-2]),
-                )
-
-                samples = embed_watermark(samples)
-                samples = filter(samples)
-                vid = (
-                    (rearrange(samples, "t c h w -> t h w c") * 255)
-                    .cpu()
-                    .numpy()
-                    .astype(np.uint8)
-                )
-                for frame in vid:
-                    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-                    writer.write(frame)
-                writer.release()
+        with Image.open(input_img_path) as image:
+            if image.mode == "RGBA":
+                image = image.convert("RGB")
+            w, h = image.size
+
+            if h % 64 != 0 or w % 64 != 0:
+                width, height = map(lambda x: x - x % 64, (w, h))
+                image = image.resize((width, height))
+                print(
+                    f"WARNING: Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
+                )
+
+            image = ToTensor()(image)
+            image = image * 2.0 - 1.0
+
+            image = image.unsqueeze(0).to(device)
+            H, W = image.shape[2:]
+            assert image.shape[1] == 3
+            F = 8
+            C = 4
+            shape = (num_frames, C, H // F, W // F)
+            if (H, W) != (576, 1024):
+                print(
+                    "WARNING: The conditioning frame you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
+                )
+            if motion_bucket_id > 255:
+                print(
+                    "WARNING: High motion bucket! This may lead to suboptimal performance."
+                )
+
+            if fps_id < 5:
+                print("WARNING: Small fps value! This may lead to suboptimal performance.")
+
+            if fps_id > 30:
+                print("WARNING: Large fps value! This may lead to suboptimal performance.")
+
+            value_dict = {}
+            value_dict["motion_bucket_id"] = motion_bucket_id
+            value_dict["fps_id"] = fps_id
+            value_dict["cond_aug"] = cond_aug
+            value_dict["cond_frames_without_noise"] = image
+            value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
+            value_dict["cond_aug"] = cond_aug
+
+
+
+            with torch.no_grad():
+                with torch.autocast(device):
+                    batch, batch_uc = get_batch(
+                        get_unique_embedder_keys_from_conditioner(model.conditioner),
+                        value_dict,
+                        [1, num_frames],
+                        T=num_frames,
+                        device=device,
+                    )
+                    c, uc = model.conditioner.get_unconditional_conditioning(
+                        batch,
+                        batch_uc=batch_uc,
+                        force_uc_zero_embeddings=[
+                            "cond_frames",
+                            "cond_frames_without_noise",
+                        ],
+                    )
+
+                    for k in ["crossattn", "concat"]:
+                        uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
+                        uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
+                        c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
+                        c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
+
+                    randn = torch.randn(shape, device=device)
+
+                    additional_model_inputs = {}
+                    additional_model_inputs["image_only_indicator"] = torch.zeros(
+                        2, num_frames
+                    ).to(device)
+                    additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
+
+                    def denoiser(input, sigma, c):
+                        return model.denoiser(
+                            model.model, input, sigma, c, **additional_model_inputs
+                        )
+
+                    samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
+                    model.en_and_decode_n_samples_a_time = decoding_t
+                    samples_x = model.decode_first_stage(samples_z)
+                    samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
+
+                    os.makedirs(output_folder, exist_ok=True)
+                    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
+                    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
+                    writer = cv2.VideoWriter(
+                        video_path,
+                        cv2.VideoWriter_fourcc(*"MP4V"),
+                        fps_id + 1,
+                        (samples.shape[-1], samples.shape[-2]),
+                    )
+
+                    samples = embed_watermark(samples)
+                    samples = filter(samples)
+                    vid = (
+                        (rearrange(samples, "t c h w -> t h w c") * 255)
+                        .cpu()
+                        .numpy()
+                        .astype(np.uint8)
+                    )
+                    for frame in vid:
+                        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+                        writer.write(frame)
+                    writer.release()
     return video_path
 
 def get_unique_embedder_keys_from_conditioner(conditioner):
@@ -272,7 +293,7 @@ with gr.Blocks(css=css) as demo:
     Generate 25 frames of video from a single image with SDV-XT. [Join the waitlist](https://stability.ai/contact) for the text-to-video web experience
     ''')
     with gr.Column():
-        image = gr.Image(label="Upload your image (it will be center cropped to 1024x576)", type="pil")
+        image = gr.Image(label="Upload your image (it will be center cropped to 1024x576)", type="filepath")
         generate_btn = gr.Button("Generate")
         #with gr.Accordion("Advanced options", open=False):
         #    cond_aug = gr.Slider(label="Conditioning augmentation", value=0.02, minimum=0.0)
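
For reference, the resize guard in sample() rounds each dimension down to the nearest multiple of 64 before the image is turned into a tensor. A minimal standalone sketch of that arithmetic (the helper name and the sample sizes are hypothetical, not from the commit):

def snap_to_multiple_of_64(w, h):
    # x - x % 64 rounds x down to the nearest multiple of 64
    return w - w % 64, h - h % 64

assert snap_to_multiple_of_64(1024, 576) == (1024, 576)  # already aligned, left untouched
assert snap_to_multiple_of_64(1000, 600) == (960, 576)   # both axes rounded down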
 
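The latent shape built in the hunk, (num_frames, C, H // F, W // F), follows from the model's autoencoder: F = 8 is the spatial downsampling factor and C = 4 the number of latent channels, so the recommended 576x1024 conditioning frame gives 72x128 latents per frame. A quick check of that arithmetic, using the 25 frames the demo generates:

num_frames, C, F, H, W = 25, 4, 8, 576, 1024
shape = (num_frames, C, H // F, W // F)
assert shape == (25, 4, 72, 128)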
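
The repeat/rearrange pair in the hunk tiles each per-image conditioning tensor across time and then folds the time axis into the batch axis, so every generated frame sees the same conditioning. A minimal sketch with a hypothetical embedding size:

import torch
from einops import repeat, rearrange

num_frames = 25
c = torch.randn(1, 1024)                         # hypothetical per-image embedding
c = repeat(c, "b ... -> b t ...", t=num_frames)  # (1, 25, 1024): copied across time
c = rearrange(c, "b t ... -> (b t) ...")         # (25, 1024): time folded into batch
assert c.shape == (num_frames, 1024)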
 
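The change from type="pil" to type="filepath" matters because it changes what Gradio hands the click handler: the path of the uploaded file as a str rather than a PIL.Image, which is what the new Path(input_path) logic in sample expects. A minimal standalone sketch of that behavior (the handler is hypothetical, not the app's):

import gradio as gr

def handler(image_path):
    # With type="filepath", Gradio saves the upload to disk and passes its path as a str.
    return f"received: {image_path}"

with gr.Blocks() as demo:
    image = gr.Image(type="filepath")
    output = gr.Textbox()
    gr.Button("Generate").click(handler, inputs=[image], outputs=[output])

demo.launch()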