Spaces:

tedlasai
/

blur2vid

Running on Zero

App Files Files Community

tedlasai committed 20 days ago

Commit

6a2f159

1 Parent(s): 6a1328e

fixed full pipeline and added options

Browse files

Files changed (2) hide show

gradio/app.py +77 -26
inference.py +14 -6

gradio/app.py CHANGED Viewed

@@ -19,6 +19,8 @@ args.pretrained_model_path = "THUDM/CogVideoX-2b"
 args.model_config_path = "training/configs/outsidephotos.yaml"
 args.video_width = 1280
 args.video_height = 720
 args.seed = None
 pipe, model_config = load_model(args)
@@ -27,40 +29,62 @@ OUTPUT_DIR = Path("/tmp/generated_videos")
 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-@spaces.GPU
-def generate_video_from_image(image: Image.Image) -> str:
     print("Generating video")
     video_id = uuid.uuid4().hex
     output_path = OUTPUT_DIR / f"{video_id}.mp4"
     args.device = "cuda"
-    processed_image, video = inference_on_image(pipe, image, "past_present_and_future", model_config, args)
     export_to_video(video, output_path, fps=20)
-    return str(output_path)
-def demo_predict(image: Image.Image) -> str:
-    """
-    Wrapper for Gradio. Takes an image and returns a video path.
-    """
-    if image is None:
-        raise gr.Error("Please upload an image first.")
-    video_path = generate_video_from_image(image)
-    if not os.path.exists(video_path):
         raise gr.Error("Video generation failed: output file not found.")
-    return video_path
 with gr.Blocks(css="footer {visibility: hidden}") as demo:
     gr.Markdown(
         """
-        # 🖼️ ➜ 🎬 Recover motion from a blurry image!
-        Upload an image and the model will generate a short video.
         """
     )
@@ -71,24 +95,51 @@ with gr.Blocks(css="footer {visibility: hidden}") as demo:
                 label="Input image",
                 interactive=True,
             )
-            tense_choice = gr.Dropdown(
-                label="I want to generate the",
-                choices=["present", "past, present and future"],
-                value="past, present and future",           # default selection
-                interactive=True,
             )
             generate_btn = gr.Button("Generate video", variant="primary")
         with gr.Column():
             video_out = gr.Video(
                 label="Generated video",
-                format="mp4",  # ensures browser-friendly output
                 autoplay=True,
                 loop=True,
             )
     generate_btn.click(
-        fn=demo_predict,
-        inputs=image_in,
         outputs=video_out,
         api_name="predict",
     )

 args.model_config_path = "training/configs/outsidephotos.yaml"
 args.video_width = 1280
 args.video_height = 720
+# args.video_width = 960
+# args.video_height = 540
 args.seed = None
 pipe, model_config = load_model(args)
 OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+@spaces.GPU(timeout=300)
+def generate_video_from_image(image: Image.Image, interval_key: str, orientation_mode: str, num_inference_steps: int) -> str:
+    """
+    Wrapper for Gradio. Takes an image and returns a video path.
+    """
+    if image is None:
+        raise gr.Error("Please upload an image first.")
     print("Generating video")
+    import torch
+    print("CUDA:", torch.cuda.is_available())
+    print("Device:", torch.cuda.get_device_name(0))
+    print("bf16 supported:", torch.cuda.is_bf16_supported())
+    if orientation_mode == "Landscape (1280×720)":
+        print("Chosing resolution 1280×720 (landscape)")
+        args.video_width = 1280
+        args.video_height = 720
+    elif orientation_mode == "Portrait (720×1280)":
+        print("Choosing resolution 720×1280 (portrait)")
+        args.video_height = 1280
+        args.video_width = 720
+    else:
+        print("Unknown orientation mode", orientation_mode, "defaulting to 1280x720")
+        args.video_width = 1280
+        args.video_height = 720
+    args.num_inference_steps = num_inference_steps
     video_id = uuid.uuid4().hex
     output_path = OUTPUT_DIR / f"{video_id}.mp4"
     args.device = "cuda"
+    pipe.to(args.device)
+    processed_image, video = inference_on_image(pipe, image, interval_key, model_config, args)
     export_to_video(video, output_path, fps=20)
+    if not os.path.exists(output_path):
         raise gr.Error("Video generation failed: output file not found.")
+    return str(output_path)
 with gr.Blocks(css="footer {visibility: hidden}") as demo:
     gr.Markdown(
         """
+        # 🖼️ ➜ 🎬 Recover Motion from a Blurry Image
+        This demo accompanies the paper **“Generating the Past, Present, and Future from a Motion-Blurred Image”**
+        by Tedla *et al.*, ACM Transactions on Graphics (SIGGRAPH Asia 2025).
+        - 🌐 **Project page:** <https://blur2vid.github.io/>
+        - 💻 **Code:** <https://github.com/tedlasai/blur2vid/>
+        Upload a blurry image and the model will generate a short video containing the recovered motion depending on your selection.
         """
     )
                 label="Input image",
                 interactive=True,
             )
+            with gr.Row():
+                tense_choice = gr.Radio(
+                    label="Select the interval to be generated:",
+                    choices=["present", "past, present and future"],
+                    value="past, present and future",
+                    interactive=True,
+                )
+            with gr.Row():
+                mode_choice = gr.Radio(
+                    label="Orientation",
+                    choices=["Landscape (1280×720)", "Portrait (720×1280)"],
+                    value="Landscape (1280×720)",
+                    interactive=True,
+                )
+            gr.Markdown(
+                "<span style='font-size: 12px; color: gray;'>"
+                "Note: Model was trained on 1280×720 (Landscape). Portrait mode will degrade performance."
+                "</span>"
+            )
+            num_inference_steps = gr.Slider(
+                label="Number of inference steps",
+                minimum=4,
+                maximum=50,
+                step=1,
+                value=20,
+                info="More steps = better quality but slower",
             )
             generate_btn = gr.Button("Generate video", variant="primary")
         with gr.Column():
             video_out = gr.Video(
                 label="Generated video",
+                format="mp4",
                 autoplay=True,
                 loop=True,
             )
     generate_btn.click(
+        fn=generate_video_from_image,
+        inputs=[image_in, tense_choice, mode_choice, num_inference_steps],   # ← include tense_choice!
         outputs=video_out,
         api_name="predict",
     )

inference.py CHANGED Viewed

@@ -122,6 +122,7 @@ def load_model(args):
         revision=model_config["revision"],
         variant=model_config["variant"],
         low_cpu_mem_usage=False,
     )
     weight_path = hf_hub_download(
         repo_id=args.blur2vid_hf_repo_path,
@@ -159,11 +160,12 @@ def load_model(args):
     # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
     # as these weights are only used for inference, keeping weights in full precision is not required.
-    weight_dtype = torch.bfloat16
-    # text_encoder.to(dtype=weight_dtype)
-    # transformer.to(dtype=weight_dtype)
-    # vae.to(dtype=weight_dtype)
     pipe = ControlnetCogVideoXPipeline.from_pretrained(
         args.pretrained_model_path,
@@ -199,7 +201,7 @@ def inference_on_image(pipe, image, interval_key, model_config, args):
     # run inference
     generator = torch.Generator(device=args.device).manual_seed(args.seed) if args.seed else None
-    with torch.autocast(args.device, enabled=True):
         batch = convert_to_batch(image, interval_key, (args.video_height, args.video_width))
         frame = batch["blur_img"].permute(0, 2, 3, 1).cpu().numpy()
@@ -216,7 +218,7 @@ def inference_on_image(pipe, image, interval_key, model_config, args):
             "height": batch["height"],
             "width": batch["width"],
             "num_frames": torch.tensor([[model_config["max_num_frames"]]]), # torch.tensor([[batch["num_frames"]]]),
-            "num_inference_steps": model_config["num_inference_steps"],
         }
         input_image = frame
@@ -305,6 +307,12 @@ if __name__ == "__main__":
         default=720,
         help="video resolution height",
     )
     parser.add_argument(
         "--seed",
         type=int,

         revision=model_config["revision"],
         variant=model_config["variant"],
         low_cpu_mem_usage=False,
+        attn_implementation="flash_attention_2",
     )
     weight_path = hf_hub_download(
         repo_id=args.blur2vid_hf_repo_path,
     # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
     # as these weights are only used for inference, keeping weights in full precision is not required.
+    # NOTE(review): an earlier note here said HF Spaces needs full precision (torch.float32), but the weights below are cast to bfloat16 — confirm which is intended.
+    weight_dtype = torch.bfloat16  # torch.float32  # torch.bfloat16
+    text_encoder.to(dtype=weight_dtype)
+    transformer.to(dtype=weight_dtype)
+    vae.to(dtype=weight_dtype)
     pipe = ControlnetCogVideoXPipeline.from_pretrained(
         args.pretrained_model_path,
     # run inference
     generator = torch.Generator(device=args.device).manual_seed(args.seed) if args.seed else None
+    with torch.autocast(device_type=args.device, dtype=torch.bfloat16, enabled=True):
         batch = convert_to_batch(image, interval_key, (args.video_height, args.video_width))
         frame = batch["blur_img"].permute(0, 2, 3, 1).cpu().numpy()
             "height": batch["height"],
             "width": batch["width"],
             "num_frames": torch.tensor([[model_config["max_num_frames"]]]), # torch.tensor([[batch["num_frames"]]]),
+            "num_inference_steps": args.num_inference_steps,
         }
         input_image = frame
         default=720,
         help="video resolution height",
     )
+    parser.add_argument(
+        "--num_inference_steps",
+        type=int,
+        default=50,
+        help="number of DDIM steps",
+    )
     parser.add_argument(
         "--seed",
         type=int,