Spaces: Running on Zero

myniu committed · Commit e9f1b91 · Parent(s): bfb52d0

init
app.py CHANGED
@@ -89,79 +89,6 @@ def get_sparseflow_and_mask_forward(
     return s_flow, mask


-@spaces.GPU(duration=200)
-def init_models(pretrained_model_name_or_path, resume_from_checkpoint, weight_dtype, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
-
-    from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
-    from pipeline.pipeline import FlowControlNetPipeline
-    from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
-
-    print('start loading models...')
-    # Load scheduler, tokenizer and models.
-    image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-        pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
-    )
-    vae = AutoencoderKLTemporalDecoder.from_pretrained(
-        pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
-    unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
-        pretrained_model_name_or_path,
-        subfolder="unet",
-        low_cpu_mem_usage=True,
-        variant="fp16",
-    )
-
-    controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
-
-    cmp = CMP_demo(
-        './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
-        42000
-    ).to(device)
-    cmp.requires_grad_(False)
-
-    # Freeze vae and image_encoder
-    vae.requires_grad_(False)
-    image_encoder.requires_grad_(False)
-    unet.requires_grad_(False)
-    controlnet.requires_grad_(False)
-
-    # Move image_encoder and vae to gpu and cast to weight_dtype
-    image_encoder.to(device, dtype=weight_dtype)
-    vae.to(device, dtype=weight_dtype)
-    unet.to(device, dtype=weight_dtype)
-    controlnet.to(device, dtype=weight_dtype)
-
-    if enable_xformers_memory_efficient_attention:
-        if is_xformers_available():
-            import xformers
-
-            xformers_version = version.parse(xformers.__version__)
-            if xformers_version == version.parse("0.0.16"):
-                print(
-                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
-                )
-            unet.enable_xformers_memory_efficient_attention()
-        else:
-            raise ValueError(
-                "xformers is not available. Make sure it is installed correctly")
-
-    if allow_tf32:
-        torch.backends.cuda.matmul.allow_tf32 = True
-
-    pipeline = FlowControlNetPipeline.from_pretrained(
-        pretrained_model_name_or_path,
-        unet=unet,
-        controlnet=controlnet,
-        image_encoder=image_encoder,
-        vae=vae,
-        torch_dtype=weight_dtype,
-    )
-    pipeline = pipeline.to(device)
-
-    print('models loaded.')
-
-    return pipeline, cmp
-
-
 def interpolate_trajectory(points, n_points):
     x = [point[0] for point in points]
     y = [point[1] for point in points]
@@ -236,15 +163,8 @@ with gr.Blocks() as demo:
     )

     height, width = 512, 512
-
-
-
-    pipeline, cmp = init_models(
-        svd_ckpt,
-        mofa_ckpt,
-        weight_dtype=torch.float16,
-        device='cuda'
-    )
+
+    pipeline, cmp = None, None

     first_frame_path = gr.State()
     tracking_points = gr.State([])
@@ -253,6 +173,78 @@ with gr.Blocks() as demo:
     motion_brush_viz = gr.State()
     inference_batch_size = gr.State(1)

+    @spaces.GPU(duration=100)
+    def init_models(pretrained_model_name_or_path="ckpts/stable-video-diffusion-img2vid-xt-1-1", resume_from_checkpoint="ckpts/controlnet", weight_dtype=torch.float16, device='cuda', enable_xformers_memory_efficient_attention=False, allow_tf32=False):
+
+        from models.unet_spatio_temporal_condition_controlnet import UNetSpatioTemporalConditionControlNetModel
+        from pipeline.pipeline import FlowControlNetPipeline
+        from models.svdxt_featureflow_forward_controlnet_s2d_fixcmp_norefine import FlowControlNet, CMP_demo
+
+        print('start loading models...')
+        # Load scheduler, tokenizer and models.
+        image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+            pretrained_model_name_or_path, subfolder="image_encoder", revision=None, variant="fp16"
+        )
+        vae = AutoencoderKLTemporalDecoder.from_pretrained(
+            pretrained_model_name_or_path, subfolder="vae", revision=None, variant="fp16")
+        unet = UNetSpatioTemporalConditionControlNetModel.from_pretrained(
+            pretrained_model_name_or_path,
+            subfolder="unet",
+            low_cpu_mem_usage=True,
+            variant="fp16",
+        )
+
+        controlnet = FlowControlNet.from_pretrained(resume_from_checkpoint)
+
+        cmp = CMP_demo(
+            './models/cmp/experiments/semiauto_annot/resnet50_vip+mpii_liteflow/config.yaml',
+            42000
+        ).to(device)
+        cmp.requires_grad_(False)
+
+        # Freeze vae and image_encoder
+        vae.requires_grad_(False)
+        image_encoder.requires_grad_(False)
+        unet.requires_grad_(False)
+        controlnet.requires_grad_(False)
+
+        # Move image_encoder and vae to gpu and cast to weight_dtype
+        image_encoder.to(device, dtype=weight_dtype)
+        vae.to(device, dtype=weight_dtype)
+        unet.to(device, dtype=weight_dtype)
+        controlnet.to(device, dtype=weight_dtype)
+
+        if enable_xformers_memory_efficient_attention:
+            if is_xformers_available():
+                import xformers
+
+                xformers_version = version.parse(xformers.__version__)
+                if xformers_version == version.parse("0.0.16"):
+                    print(
+                        "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                    )
+                unet.enable_xformers_memory_efficient_attention()
+            else:
+                raise ValueError(
+                    "xformers is not available. Make sure it is installed correctly")
+
+        if allow_tf32:
+            torch.backends.cuda.matmul.allow_tf32 = True
+
+        pipeline = FlowControlNetPipeline.from_pretrained(
+            pretrained_model_name_or_path,
+            unet=unet,
+            controlnet=controlnet,
+            image_encoder=image_encoder,
+            vae=vae,
+            torch_dtype=weight_dtype,
+        )
+        pipeline = pipeline.to(device)
+
+        print('models loaded.')
+
+        return pipeline, cmp
+
     def get_cmp_flow(frames, sparse_optical_flow, mask, brush_mask=None):

         '''
@@ -652,8 +644,11 @@ with gr.Blocks() as demo:

         return hint_path, outputs_path, flows_path, outputs_mp4_path, flows_mp4_path

+    @spaces.GPU(duration=100)
     def preprocess_image(image):

+        pipeline, cmp = init_models()
+
         image_pil = image2pil(image.name)
         raw_w, raw_h = image_pil.size

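Context for the change above: on a ZeroGPU Space ("Running on Zero"), no GPU is attached while the Gradio app is built, so CUDA work at import or Blocks-construction time fails; a GPU is leased only while a function decorated with @spaces.GPU runs. That is why the commit replaces the eager pipeline, cmp = init_models(...) at build time with pipeline, cmp = None, None and moves model loading into the decorated init_models(), called from preprocess_image(). A minimal, self-contained sketch of that pattern (the gpu_matmul demo is illustrative only, not code from this repo):

import gradio as gr
import spaces  # ZeroGPU helper package available on Hugging Face Spaces
import torch

@spaces.GPU(duration=100)  # lease a GPU for up to 100 s per call
def gpu_matmul(n):
    # On ZeroGPU, device='cuda' is valid only inside a @spaces.GPU function,
    # which mirrors why init_models() now runs inside the request path.
    a = torch.randn(int(n), int(n), device='cuda')
    b = torch.randn(int(n), int(n), device='cuda')
    return f"sum = {(a @ b).sum().item():.3f}"

with gr.Blocks() as demo:
    size = gr.Number(value=1024, label="matrix size")
    result = gr.Textbox(label="result")
    gr.Button("run on GPU").click(gpu_matmul, inputs=size, outputs=result)

demo.launch()

Note the trade-off: as diffed, preprocess_image() calls init_models() on every invocation, so the models are rebuilt per request unless cached elsewhere; the duration=100 lease (down from the removed duration=200) budgets each call accordingly.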