wondervictor committed
Update app.py
app.py CHANGED
@@ -27,7 +27,7 @@ import numpy as np
 import sys
 import tqdm
 
-version = "YxZhang/evf-sam2"
+version = "YxZhang/evf-sam2-multitask"
 model_type = "sam2"
 
 tokenizer = AutoTokenizer.from_pretrained(
@@ -58,7 +58,7 @@ video_model.to('cuda')
 
 @spaces.GPU
 @torch.no_grad()
-def inference_image(image_np, prompt):
+def inference_image(image_np, prompt, semantic_type):
     original_size_list = [image_np.shape[:2]]
 
     image_beit = beit3_preprocess(image_np, 224).to(dtype=image_model.dtype,
@@ -68,6 +68,8 @@ def inference_image(image_np, prompt):
     image_sam = image_sam.to(dtype=image_model.dtype,
                              device=image_model.device)
 
+    if semantic_type:
+        prompt = "[semantic] " + prompt
     input_ids = tokenizer(
         prompt, return_tensors="pt")["input_ids"].to(device=image_model.device)
 
@@ -93,7 +95,7 @@ def inference_image(image_np, prompt):
 @spaces.GPU
 @torch.no_grad()
 @torch.autocast(device_type="cuda", dtype=torch.float16)
-def inference_video(video_path, prompt):
+def inference_video(video_path, prompt, semantic_type):
 
     os.system("rm -rf demo_temp")
     os.makedirs("demo_temp/input_frames", exist_ok=True)
@@ -109,6 +111,8 @@ def inference_video(video_path, prompt):
     image_beit = beit3_preprocess(image_np, 224).to(dtype=video_model.dtype,
                                                     device=video_model.device)
 
+    if semantic_type:
+        prompt = "[semantic] " + prompt
     input_ids = tokenizer(
         prompt, return_tensors="pt")["input_ids"].to(device=video_model.device)
 
@@ -162,6 +166,12 @@ with gr.Blocks() as demo:
                 submit_image = gr.Button(value='Submit',
                                          scale=1,
                                          variant='primary')
+            with gr.Row():
+                semantic_type_img = gr.Checkbox(
+                    False,
+                    label="semantic level",
+                    info="check this if you want to segment body parts or background or multi objects (only available with latest evf-sam checkpoint)"
+                )
     with gr.Tab(label="EVF-SAM-2-Video"):
         with gr.Row():
             input_video = gr.Video(label='Input Video')
@@ -175,11 +185,17 @@ with gr.Blocks() as demo:
                 submit_video = gr.Button(value='Submit',
                                          scale=1,
                                          variant='primary')
+            with gr.Row():
+                semantic_type_vid = gr.Checkbox(
+                    False,
+                    label="semantic level",
+                    info="check this if you want to segment body parts or background or multi objects (only available with latest evf-sam checkpoint)"
+                )
 
     submit_image.click(fn=inference_image,
-                       inputs=[input_image, image_prompt],
+                       inputs=[input_image, image_prompt, semantic_type_img],
                        outputs=output_image)
     submit_video.click(fn=inference_video,
-                       inputs=[input_video, video_prompt],
+                       inputs=[input_video, video_prompt, semantic_type_vid],
                        outputs=output_video)
demo.launch(show_error=True)
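
In short, the commit points the demo at the YxZhang/evf-sam2-multitask checkpoint and adds a "semantic level" checkbox to both the image and video tabs; when the box is checked, each handler prepends the "[semantic] " token to the text prompt before tokenization. The minimal sketch below reproduces that checkbox-to-prompt wiring in isolation. It is an illustrative sketch, not the actual app: run_model is a hypothetical placeholder for the EVF-SAM forward pass.

import gradio as gr

def run_model(image_np, prompt):
    # Hypothetical stand-in for the EVF-SAM forward pass: echoes the
    # input image so the sketch runs without model weights.
    print(f"prompt sent to model: {prompt!r}")
    return image_np

def inference_image(image_np, prompt, semantic_type):
    # Same pattern as the commit: the checkbox state arrives as a bool,
    # and a truthy value prepends the "[semantic] " token that switches
    # the multitask checkpoint to semantic-level segmentation.
    if semantic_type:
        prompt = "[semantic] " + prompt
    return run_model(image_np, prompt)

with gr.Blocks() as demo:
    input_image = gr.Image(label="Input Image")
    image_prompt = gr.Textbox(label="Prompt")
    # The first positional argument of gr.Checkbox is its initial value,
    # as in the diff above.
    semantic_type = gr.Checkbox(False, label="semantic level")
    output_image = gr.Image(label="Output")
    submit = gr.Button("Submit")
    # Listing the checkbox as a third input makes Gradio forward its
    # bool state to the handler on every click.
    submit.click(fn=inference_image,
                 inputs=[input_image, image_prompt, semantic_type],
                 outputs=output_image)

if __name__ == "__main__":
    demo.launch()

Passing the checkbox through inputs keeps the handler a pure function of its arguments, the standard Gradio pattern, which is why the commit extends both .click(...) calls rather than reading the checkbox state globally.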