Spaces:

multimodalart
/

stable-video-diffusion

Running on Zero

App Files Files Community

multimodalart HF staff commited on Nov 21, 2023

Commit

3b16b97

1 Parent(s): 2e6eea6

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -165

app.py CHANGED Viewed

@@ -19,6 +19,8 @@ from sgm.inference.helpers import embed_watermark
 from sgm.util import default, instantiate_from_config
 from huggingface_hub import hf_hub_download
 num_frames = 25
 num_steps = 30
 model_config = "scripts/sampling/configs/svd_xt.yaml"
@@ -30,38 +32,6 @@ css = '''
 .gradio-container{max-width:850px !important}
 '''
-def load_model(
-    config: str,
-    device: str,
-    num_frames: int,
-    num_steps: int,
-):
-    config = OmegaConf.load(config)
-    if device == "cuda":
-        config.model.params.conditioner_config.params.emb_models[
-            0
-        ].params.open_clip_embedding_config.params.init_device = device
-    config.model.params.sampler_config.params.num_steps = num_steps
-    config.model.params.sampler_config.params.guider_config.params.num_frames = (
-        num_frames
-    )
-    if device == "cuda":
-        with torch.device(device):
-            model = instantiate_from_config(config.model).to(device).eval()
-    else:
-        model = instantiate_from_config(config.model).to(device).eval()
-    filter = DeepFloydDataFiltering(verbose=False, device=device)
-    return model, filter
-model, filter = load_model(
-    model_config,
-    device,
-    num_frames,
-    num_steps,
-)
 def sample(
     input_path: str,
     num_frames: Optional[int] = 25,
@@ -74,139 +44,8 @@ def sample(
     decoding_t: int = 7,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
 ):
     output_folder = str(uuid.uuid4())
-    torch.manual_seed(seed)
-    path = Path(input_path)
-    all_img_paths = []
-    if path.is_file():
-        if any([input_path.endswith(x) for x in ["jpg", "jpeg", "png"]]):
-            all_img_paths = [input_path]
-        else:
-            raise ValueError("Path is not valid image file.")
-    elif path.is_dir():
-        all_img_paths = sorted(
-            [
-                f
-                for f in path.iterdir()
-                if f.is_file() and f.suffix.lower() in [".jpg", ".jpeg", ".png"]
-            ]
-        )
-        if len(all_img_paths) == 0:
-            raise ValueError("Folder does not contain any images.")
-    else:
-        raise ValueError
-    for input_img_path in all_img_paths:
-        with Image.open(input_img_path) as image:
-            if image.mode == "RGBA":
-                image = image.convert("RGB")
-            w, h = image.size
-            if h % 64 != 0 or w % 64 != 0:
-                width, height = map(lambda x: x - x % 64, (w, h))
-                image = image.resize((width, height))
-                print(
-                    f"WARNING: Your image is of size {h}x{w} which is not divisible by 64. We are resizing to {height}x{width}!"
-                )
-            image = ToTensor()(image)
-            image = image * 2.0 - 1.0
-            image = image.unsqueeze(0).to(device)
-            H, W = image.shape[2:]
-            assert image.shape[1] == 3
-            F = 8
-            C = 4
-            shape = (num_frames, C, H // F, W // F)
-            if (H, W) != (576, 1024):
-                print(
-                    "WARNING: The conditioning frame you provided is not 576x1024. This leads to suboptimal performance as model was only trained on 576x1024. Consider increasing `cond_aug`."
-                )
-            if motion_bucket_id > 255:
-                print(
-                    "WARNING: High motion bucket! This may lead to suboptimal performance."
-                )
-            if fps_id < 5:
-                print("WARNING: Small fps value! This may lead to suboptimal performance.")
-            if fps_id > 30:
-                print("WARNING: Large fps value! This may lead to suboptimal performance.")
-            value_dict = {}
-            value_dict["motion_bucket_id"] = motion_bucket_id
-            value_dict["fps_id"] = fps_id
-            value_dict["cond_aug"] = cond_aug
-            value_dict["cond_frames_without_noise"] = image
-            value_dict["cond_frames"] = image + cond_aug * torch.randn_like(image)
-            value_dict["cond_aug"] = cond_aug
-            with torch.no_grad():
-                with torch.autocast(device):
-                    batch, batch_uc = get_batch(
-                        get_unique_embedder_keys_from_conditioner(model.conditioner),
-                        value_dict,
-                        [1, num_frames],
-                        T=num_frames,
-                        device=device,
-                    )
-                    c, uc = model.conditioner.get_unconditional_conditioning(
-                        batch,
-                        batch_uc=batch_uc,
-                        force_uc_zero_embeddings=[
-                            "cond_frames",
-                            "cond_frames_without_noise",
-                        ],
-                    )
-                    for k in ["crossattn", "concat"]:
-                        uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
-                        uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
-                        c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
-                        c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
-                    randn = torch.randn(shape, device=device)
-                    additional_model_inputs = {}
-                    additional_model_inputs["image_only_indicator"] = torch.zeros(
-                        2, num_frames
-                    ).to(device)
-                    additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
-                    def denoiser(input, sigma, c):
-                        return model.denoiser(
-                            model.model, input, sigma, c, **additional_model_inputs
-                        )
-                    samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
-                    model.en_and_decode_n_samples_a_time = decoding_t
-                    samples_x = model.decode_first_stage(samples_z)
-                    samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
-                    os.makedirs(output_folder, exist_ok=True)
-                    base_count = len(glob(os.path.join(output_folder, "*.mp4")))
-                    video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
-                    writer = cv2.VideoWriter(
-                        video_path,
-                        cv2.VideoWriter_fourcc(*"MP4V"),
-                        fps_id + 1,
-                        (samples.shape[-1], samples.shape[-2]),
-                    )
-                    samples = embed_watermark(samples)
-                    samples = filter(samples)
-                    vid = (
-                        (rearrange(samples, "t c h w -> t h w c") * 255)
-                        .cpu()
-                        .numpy()
-                        .astype(np.uint8)
-                    )
-                    for frame in vid:
-                        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-                        writer.write(frame)
-                    writer.release()
-        return video_path
 def get_unique_embedder_keys_from_conditioner(conditioner):
     return list(set([x.input_key for x in conditioner.embedders]))

 from sgm.util import default, instantiate_from_config
 from huggingface_hub import hf_hub_download
+from simple_video_sample import sample
 num_frames = 25
 num_steps = 30
 model_config = "scripts/sampling/configs/svd_xt.yaml"
 .gradio-container{max-width:850px !important}
 '''
 def sample(
     input_path: str,
     num_frames: Optional[int] = 25,
     decoding_t: int = 7,  # Number of frames decoded at a time! This eats most VRAM. Reduce if necessary.
 ):
     output_folder = str(uuid.uuid4())
+    sample(input_path, version, output_folder, decoding_t)
+    return f"{output_folder}/000000.mp4"
 def get_unique_embedder_keys_from_conditioner(conditioner):
     return list(set([x.input_key for x in conditioner.embedders]))