watchtowerss committed on
Commit
bb879e5
1 Parent(s): 54438f1

add inpaint and example

.gitattributes CHANGED
@@ -36,3 +36,10 @@ assets/demo_version_1.MP4 filter=lfs diff=lfs merge=lfs -text
  assets/inpainting.gif filter=lfs diff=lfs merge=lfs -text
  assets/qingming.mp4 filter=lfs diff=lfs merge=lfs -text
  test_sample/test-sample1.mp4 filter=lfs diff=lfs merge=lfs -text
+ assets/avengers.gif filter=lfs diff=lfs merge=lfs -text
+ overleaf/Track[[:space:]]Anything/figs/avengers_1.pdf filter=lfs diff=lfs merge=lfs -text
+ overleaf/Track[[:space:]]Anything/figs/davisresults.pdf filter=lfs diff=lfs merge=lfs -text
+ overleaf/Track[[:space:]]Anything/figs/failedcases.pdf filter=lfs diff=lfs merge=lfs -text
+ test_sample/test-sample13.mp4 filter=lfs diff=lfs merge=lfs -text
+ test_sample/test-sample4.mp4 filter=lfs diff=lfs merge=lfs -text
+ test_sample/test-sample8.mp4 filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,8 +1,7 @@
1
  import gradio as gr
2
  import argparse
 
3
  import cv2
4
- import time
5
- from PIL import Image
6
  import numpy as np
7
  import os
8
  import sys
@@ -14,9 +13,8 @@ import requests
14
  import json
15
  import torchvision
16
  import torch
17
- import concurrent.futures
18
- import queue
19
- from tools.painter import mask_painter, point_painter
20
  # download checkpoints
21
  def download_checkpoint(url, folder, filename):
22
  os.makedirs(folder, exist_ok=True)
@@ -34,6 +32,19 @@ def download_checkpoint(url, folder, filename):
34
 
35
  return filepath
36
 
37
  # convert points input to prompt state
38
  def get_prompt(click_state, click_input):
39
  inputs = json.loads(click_input)
@@ -74,18 +85,18 @@ def get_frames_from_video(video_input, video_state):
74
  break
75
  except (OSError, TypeError, ValueError, KeyError, SyntaxError) as e:
76
  print("read_frame_source:{} error. {}\n".format(video_path, str(e)))
77
-
78
  # initialize video_state
79
  video_state = {
80
  "video_name": os.path.split(video_path)[-1],
81
  "origin_images": frames,
82
  "painted_images": frames.copy(),
83
- "masks": [None]*len(frames),
84
  "logits": [None]*len(frames),
85
  "select_frame_number": 0,
86
  "fps": fps
87
  }
88
- video_info = "Video Name: {}, FPS: {}, Total Frames: {}".format(video_state["video_name"], video_state["fps"], len(frames))
89
 
90
  model.samcontroler.sam_controler.reset_image()
91
  model.samcontroler.sam_controler.set_image(video_state["origin_images"][0])
@@ -94,8 +105,10 @@ def get_frames_from_video(video_input, video_state):
94
  gr.update(visible=True), gr.update(visible=True), \
95
  gr.update(visible=True), gr.update(visible=True), \
96
  gr.update(visible=True), gr.update(visible=True), \
97
- gr.update(visible=True)
98
 
 
 
99
  # get the select frame from gradio slider
100
  def select_template(image_selection_slider, video_state, interactive_state):
101
 
@@ -108,13 +121,22 @@ def select_template(image_selection_slider, video_state, interactive_state):
108
  model.samcontroler.sam_controler.reset_image()
109
  model.samcontroler.sam_controler.set_image(video_state["origin_images"][image_selection_slider])
110
 
111
- # # clear multi mask
112
- # interactive_state["multi_mask"] = {"masks":[], "mask_names":[]}
 
 
113
 
114
  return video_state["painted_images"][image_selection_slider], video_state, interactive_state
115
 
116
- def get_end_number(track_pause_number_slider, interactive_state):
 
117
  interactive_state["track_end_number"] = track_pause_number_slider
118
  return interactive_state
119
 
120
  # use sam to get the mask
@@ -207,7 +229,7 @@ def vos_tracking_video(video_state, interactive_state, mask_dropdown):
207
  video_state["logits"][video_state["select_frame_number"]:] = logits
208
  video_state["painted_images"][video_state["select_frame_number"]:] = painted_images
209
 
210
- video_output = generate_video_from_frames(video_state["painted_images"], output_path="./result/{}".format(video_state["video_name"]), fps=fps) # import video_input to name the output video
211
  interactive_state["inference_times"] += 1
212
 
213
  print("For generating this tracking result, inference times: {}, click times: {}, positive: {}, negative: {}".format(interactive_state["inference_times"],
@@ -228,6 +250,36 @@ def vos_tracking_video(video_state, interactive_state, mask_dropdown):
228
  #### shanggao code for mask save
229
  return video_output, video_state, interactive_state
230
 
231
  # generate video after vos inference
232
  def generate_video_from_frames(frames, output_path, fps=30):
233
  """
@@ -257,17 +309,21 @@ SAM_checkpoint = "sam_vit_h_4b8939.pth"
257
  sam_checkpoint_url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
258
  xmem_checkpoint = "XMem-s012.pth"
259
  xmem_checkpoint_url = "https://github.com/hkchengrex/XMem/releases/download/v1.0/XMem-s012.pth"
 
 
 
260
  folder ="./checkpoints"
261
  SAM_checkpoint = download_checkpoint(sam_checkpoint_url, folder, SAM_checkpoint)
262
  xmem_checkpoint = download_checkpoint(xmem_checkpoint_url, folder, xmem_checkpoint)
263
-
264
  # args, defined in track_anything.py
265
  args = parse_augment()
266
  # args.port = 12315
267
- # args.device = "cuda:1"
268
  # args.mask_save = True
269
 
270
- model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, args)
 
271
 
272
  with gr.Blocks() as iface:
273
  """
@@ -283,7 +339,8 @@ with gr.Blocks() as iface:
283
  "mask_names": [],
284
  "masks": []
285
  },
286
- "track_end_number": None
 
287
  }
288
  )
289
 
@@ -293,6 +350,7 @@ with gr.Blocks() as iface:
293
  "origin_images": None,
294
  "painted_images": None,
295
  "masks": None,
 
296
  "logits": None,
297
  "select_frame_number": 0,
298
  "fps": 30
@@ -305,8 +363,11 @@ with gr.Blocks() as iface:
305
  with gr.Column():
306
  with gr.Row(scale=0.4):
307
  video_input = gr.Video(autosize=True)
308
- video_info = gr.Textbox()
309
-
 
 
 
310
 
311
 
312
  with gr.Row():
@@ -342,7 +403,9 @@ with gr.Blocks() as iface:
342
  mask_dropdown = gr.Dropdown(multiselect=True, value=[], label="Mask_select", info=".", visible=False)
343
  remove_mask_button = gr.Button(value="Remove mask", interactive=True, visible=False)
344
  video_output = gr.Video(autosize=True, visible=False).style(height=360)
345
- tracking_video_predict_button = gr.Button(value="Tracking", visible=False)
 
 
346
 
347
  # first step: get the video information
348
  extract_frames_button.click(
@@ -352,7 +415,7 @@ with gr.Blocks() as iface:
352
  ],
353
  outputs=[video_state, video_info, template_frame,
354
  image_selection_slider, track_pause_number_slider,point_prompt, click_mode, clear_button_click, Add_mask_button, template_frame,
355
- tracking_video_predict_button, video_output, mask_dropdown, remove_mask_button]
356
  )
357
 
358
  # second step: select images from slider
@@ -360,8 +423,11 @@ with gr.Blocks() as iface:
360
  inputs=[image_selection_slider, video_state, interactive_state],
361
  outputs=[template_frame, video_state, interactive_state], api_name="select_image")
362
  track_pause_number_slider.release(fn=get_end_number,
363
- inputs=[track_pause_number_slider, interactive_state],
364
- outputs=[interactive_state], api_name="end_image")
 
 
 
365
 
366
  # click select image to get mask using sam
367
  template_frame.select(
@@ -390,6 +456,13 @@ with gr.Blocks() as iface:
390
  outputs=[video_output, video_state, interactive_state]
391
  )
392
 
393
  # click to get mask
394
  mask_dropdown.change(
395
  fn=show_mask,
@@ -404,6 +477,7 @@ with gr.Blocks() as iface:
404
  "origin_images": None,
405
  "painted_images": None,
406
  "masks": None,
 
407
  "logits": None,
408
  "select_frame_number": 0,
409
  "fps": 30
@@ -417,14 +491,15 @@ with gr.Blocks() as iface:
417
  "mask_names": [],
418
  "masks": []
419
  },
420
- "track_end_number": 0
 
421
  },
422
  [[],[]],
423
  None,
424
  None,
425
  gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
426
  gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
427
- gr.update(visible=False), gr.update(visible=False), gr.update(visible=False, value=[]), gr.update(visible=False) \
428
 
429
  ),
430
  [],
@@ -435,7 +510,7 @@ with gr.Blocks() as iface:
435
  video_output,
436
  template_frame,
437
  tracking_video_predict_button, image_selection_slider , track_pause_number_slider,point_prompt, click_mode, clear_button_click,
438
- Add_mask_button, template_frame, tracking_video_predict_button, video_output, mask_dropdown, remove_mask_button
439
  ],
440
  queue=False,
441
  show_progress=False)
@@ -445,10 +520,21 @@ with gr.Blocks() as iface:
445
  fn = clear_click,
446
  inputs = [video_state, click_state,],
447
  outputs = [template_frame,click_state],
448
-
449
  )
450
  iface.queue(concurrency_count=1)
451
- iface.launch(enable_queue=True)
452
 
453
 
454
 
1
  import gradio as gr
2
  import argparse
3
+ import gdown
4
  import cv2
 
 
5
  import numpy as np
6
  import os
7
  import sys
13
  import json
14
  import torchvision
15
  import torch
16
+ from tools.painter import mask_painter
17
+
 
18
  # download checkpoints
19
  def download_checkpoint(url, folder, filename):
20
  os.makedirs(folder, exist_ok=True)
32
 
33
  return filepath
34
 
35
+ def download_checkpoint_from_google_drive(file_id, folder, filename):
36
+ os.makedirs(folder, exist_ok=True)
37
+ filepath = os.path.join(folder, filename)
38
+
39
+ if not os.path.exists(filepath):
40
+ print("Downloading checkpoints from Google Drive... Tip: if you cannot see the progress bar, please download the file manually \
41
+ and put it in the checkpoints directory. E2FGVI-HQ-CVPR22.pth: https://github.com/MCG-NKU/E2FGVI (E2FGVI-HQ model)")
42
+ url = f"https://drive.google.com/uc?id={file_id}"
43
+ gdown.download(url, filepath, quiet=False)
44
+ print("Downloaded successfully!")
45
+
46
+ return filepath
47
+
48
  # convert points input to prompt state
49
  def get_prompt(click_state, click_input):
50
  inputs = json.loads(click_input)
85
  break
86
  except (OSError, TypeError, ValueError, KeyError, SyntaxError) as e:
87
  print("read_frame_source:{} error. {}\n".format(video_path, str(e)))
88
+ image_size = (frames[0].shape[0],frames[0].shape[1])
89
  # initialize video_state
90
  video_state = {
91
  "video_name": os.path.split(video_path)[-1],
92
  "origin_images": frames,
93
  "painted_images": frames.copy(),
94
+ "masks": [np.zeros((frames[0].shape[0],frames[0].shape[1]), np.uint8)]*len(frames),
95
  "logits": [None]*len(frames),
96
  "select_frame_number": 0,
97
  "fps": fps
98
  }
99
+ video_info = "Video Name: {}, FPS: {}, Total Frames: {}, Image Size:{}".format(video_state["video_name"], video_state["fps"], len(frames), image_size)
100
 
101
  model.samcontroler.sam_controler.reset_image()
102
  model.samcontroler.sam_controler.set_image(video_state["origin_images"][0])
105
  gr.update(visible=True), gr.update(visible=True), \
106
  gr.update(visible=True), gr.update(visible=True), \
107
  gr.update(visible=True), gr.update(visible=True), \
108
+ gr.update(visible=True), gr.update(visible=True)
109
 
110
+ def run_example(example):
111
+ return video_input
112
  # get the select frame from gradio slider
113
  def select_template(image_selection_slider, video_state, interactive_state):
114
 
121
  model.samcontroler.sam_controler.reset_image()
122
  model.samcontroler.sam_controler.set_image(video_state["origin_images"][image_selection_slider])
123
 
124
+ # update the masks when selecting a new template frame
125
+ # if video_state["masks"][image_selection_slider] is not None:
126
+ # video_state["painted_images"][image_selection_slider] = mask_painter(video_state["origin_images"][image_selection_slider], video_state["masks"][image_selection_slider])
127
+
128
 
129
  return video_state["painted_images"][image_selection_slider], video_state, interactive_state
130
 
131
+ # set the tracking end frame
132
+ def get_end_number(track_pause_number_slider, video_state, interactive_state):
133
  interactive_state["track_end_number"] = track_pause_number_slider
134
+
135
+ return video_state["painted_images"][track_pause_number_slider],interactive_state
136
+
137
+ def get_resize_ratio(resize_ratio_slider, interactive_state):
138
+ interactive_state["resize_ratio"] = resize_ratio_slider
139
+
140
  return interactive_state
141
 
142
  # use sam to get the mask
229
  video_state["logits"][video_state["select_frame_number"]:] = logits
230
  video_state["painted_images"][video_state["select_frame_number"]:] = painted_images
231
 
232
+ video_output = generate_video_from_frames(video_state["painted_images"], output_path="./result/track/{}".format(video_state["video_name"]), fps=fps) # import video_input to name the output video
233
  interactive_state["inference_times"] += 1
234
 
235
  print("For generating this tracking result, inference times: {}, click times: {}, positive: {}, negative: {}".format(interactive_state["inference_times"],
250
  #### shanggao code for mask save
251
  return video_output, video_state, interactive_state
252
 
253
+ # extracting masks from mask_dropdown
254
+ # def extract_sole_mask(video_state, mask_dropdown):
255
+ # combined_masks =
256
+ # unique_masks = np.unique(combined_masks)
257
+ # return 0
258
+
259
+ # inpaint
260
+ def inpaint_video(video_state, interactive_state, mask_dropdown):
261
+ frames = np.asarray(video_state["origin_images"])
262
+ fps = video_state["fps"]
263
+ inpaint_masks = np.asarray(video_state["masks"])
264
+ if len(mask_dropdown) == 0:
265
+ mask_dropdown = ["mask_001"]
266
+ mask_dropdown.sort()
267
+ # convert mask_dropdown to mask numbers
268
+ inpaint_mask_numbers = [int(mask_dropdown[i].split("_")[1]) for i in range(len(mask_dropdown))]
269
+ # iterate through all masks and remove the masks that are not in mask_dropdown
270
+ unique_masks = np.unique(inpaint_masks)
271
+ num_masks = len(unique_masks) - 1
272
+ for i in range(1, num_masks + 1):
273
+ if i in inpaint_mask_numbers:
274
+ continue
275
+ inpaint_masks[inpaint_masks==i] = 0
276
+ # inpaint for videos
277
+ inpainted_frames = model.baseinpainter.inpaint(frames, inpaint_masks, ratio=interactive_state["resize_ratio"]) # numpy array, T, H, W, 3
278
+ video_output = generate_video_from_frames(inpainted_frames, output_path="./result/inpaint/{}".format(video_state["video_name"]), fps=fps) # import video_input to name the output video
279
+
280
+ return video_output
281
+
282
+
283
  # generate video after vos inference
284
  def generate_video_from_frames(frames, output_path, fps=30):
285
  """
309
  sam_checkpoint_url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
310
  xmem_checkpoint = "XMem-s012.pth"
311
  xmem_checkpoint_url = "https://github.com/hkchengrex/XMem/releases/download/v1.0/XMem-s012.pth"
312
+ e2fgvi_checkpoint = "E2FGVI-HQ-CVPR22.pth"
313
+ e2fgvi_checkpoint_id = "10wGdKSUOie0XmCr8SQ2A2FeDe-mfn5w3"
314
+
315
  folder ="./checkpoints"
316
  SAM_checkpoint = download_checkpoint(sam_checkpoint_url, folder, SAM_checkpoint)
317
  xmem_checkpoint = download_checkpoint(xmem_checkpoint_url, folder, xmem_checkpoint)
318
+ e2fgvi_checkpoint = download_checkpoint_from_google_drive(e2fgvi_checkpoint_id, folder, e2fgvi_checkpoint)
319
  # args, defined in track_anything.py
320
  args = parse_augment()
321
  # args.port = 12315
322
+ # args.device = "cuda:2"
323
  # args.mask_save = True
324
 
325
+ # initialize sam, xmem, e2fgvi models
326
+ model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, e2fgvi_checkpoint,args)
327
 
328
  with gr.Blocks() as iface:
329
  """
339
  "mask_names": [],
340
  "masks": []
341
  },
342
+ "track_end_number": None,
343
+ "resize_ratio": 1
344
  }
345
  )
346
 
350
  "origin_images": None,
351
  "painted_images": None,
352
  "masks": None,
353
+ "inpaint_masks": None,
354
  "logits": None,
355
  "select_frame_number": 0,
356
  "fps": 30
363
  with gr.Column():
364
  with gr.Row(scale=0.4):
365
  video_input = gr.Video(autosize=True)
366
+ with gr.Column():
367
+ video_info = gr.Textbox()
368
+ video_info = gr.Textbox(value="If you want to use the inpaint function, it is best to run this demo locally on a machine with more VRAM. \
369
+ Alternatively, you can use the resize ratio slider to scale the original frames down to around 360p resolution for faster processing.")
370
+ resize_ratio_slider = gr.Slider(minimum=0.02, maximum=1, step=0.02, value=1, label="Resize ratio", visible=True)
371
 
372
 
373
  with gr.Row():
403
  mask_dropdown = gr.Dropdown(multiselect=True, value=[], label="Mask_select", info=".", visible=False)
404
  remove_mask_button = gr.Button(value="Remove mask", interactive=True, visible=False)
405
  video_output = gr.Video(autosize=True, visible=False).style(height=360)
406
+ with gr.Row():
407
+ tracking_video_predict_button = gr.Button(value="Tracking", visible=False)
408
+ inpaint_video_predict_button = gr.Button(value="Inpaint", visible=False)
409
 
410
  # first step: get the video information
411
  extract_frames_button.click(
415
  ],
416
  outputs=[video_state, video_info, template_frame,
417
  image_selection_slider, track_pause_number_slider,point_prompt, click_mode, clear_button_click, Add_mask_button, template_frame,
418
+ tracking_video_predict_button, video_output, mask_dropdown, remove_mask_button, inpaint_video_predict_button]
419
  )
420
 
421
  # second step: select images from slider
423
  inputs=[image_selection_slider, video_state, interactive_state],
424
  outputs=[template_frame, video_state, interactive_state], api_name="select_image")
425
  track_pause_number_slider.release(fn=get_end_number,
426
+ inputs=[track_pause_number_slider, video_state, interactive_state],
427
+ outputs=[template_frame, interactive_state], api_name="end_image")
428
+ resize_ratio_slider.release(fn=get_resize_ratio,
429
+ inputs=[resize_ratio_slider, interactive_state],
430
+ outputs=[interactive_state], api_name="resize_ratio")
431
 
432
  # click select image to get mask using sam
433
  template_frame.select(
456
  outputs=[video_output, video_state, interactive_state]
457
  )
458
 
459
+ # inpaint video from select image and mask
460
+ inpaint_video_predict_button.click(
461
+ fn=inpaint_video,
462
+ inputs=[video_state, interactive_state, mask_dropdown],
463
+ outputs=[video_output]
464
+ )
465
+
466
  # click to get mask
467
  mask_dropdown.change(
468
  fn=show_mask,
477
  "origin_images": None,
478
  "painted_images": None,
479
  "masks": None,
480
+ "inpaint_masks": None,
481
  "logits": None,
482
  "select_frame_number": 0,
483
  "fps": 30
491
  "mask_names": [],
492
  "masks": []
493
  },
494
+ "track_end_number": 0,
495
+ "resize_ratio": 1
496
  },
497
  [[],[]],
498
  None,
499
  None,
500
  gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
501
  gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
502
+ gr.update(visible=False), gr.update(visible=False), gr.update(visible=False, value=[]), gr.update(visible=False), gr.update(visible=False) \
503
 
504
  ),
505
  [],
510
  video_output,
511
  template_frame,
512
  tracking_video_predict_button, image_selection_slider , track_pause_number_slider,point_prompt, click_mode, clear_button_click,
513
+ Add_mask_button, template_frame, tracking_video_predict_button, video_output, mask_dropdown, remove_mask_button,inpaint_video_predict_button
514
  ],
515
  queue=False,
516
  show_progress=False)
520
  fn = clear_click,
521
  inputs = [video_state, click_state,],
522
  outputs = [template_frame,click_state],
523
+ )
524
+ # set example
525
+ gr.Markdown("## Examples")
526
+ gr.Examples(
527
+ examples=[os.path.join(os.path.dirname(__file__), "./test_sample/", test_sample) for test_sample in ["test-sample8.mp4","test-sample4.mp4", \
528
+ "test-sample2.mp4","test-sample13.mp4"]],
529
+ fn=run_example,
530
+ inputs=[
531
+ video_input
532
+ ],
533
+ outputs=[video_input],
534
+ # cache_examples=True,
535
  )
536
  iface.queue(concurrency_count=1)
537
+ iface.launch(debug=True, enable_queue=True, server_port=args.port, server_name="0.0.0.0")
538
 
539
 
540
 
assets/avengers.gif ADDED

Git LFS Details

  • SHA256: 9193a028c2e968ff7a7ee222ccc27166a5fbbe40a4d971cee13eba519134c5cf
  • Pointer size: 133 Bytes
  • Size of remote file: 99.2 MB
assets/track-anything-logo.jpg ADDED
checkpoints/E2FGVI-HQ-CVPR22.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afff989d41205598a79ce24630b9c83af4b0a06f45b137979a25937d94c121a5
3
+ size 164535938
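The pointer above records the expected SHA-256 of the E2FGVI checkpoint. If the Google Drive download is interrupted, or the file is fetched manually as the new print message in app.py suggests, one way to validate the local copy is to hash it and compare against that oid. A minimal sketch, assuming the file sits in the ./checkpoints folder used by app.py:

    import hashlib

    EXPECTED = "afff989d41205598a79ce24630b9c83af4b0a06f45b137979a25937d94c121a5"

    def sha256_of(path, chunk_size=1 << 20):
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                h.update(chunk)  # hash the file in 1 MiB chunks
        return h.hexdigest()

    print(sha256_of("./checkpoints/E2FGVI-HQ-CVPR22.pth") == EXPECTED)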
inpainter/.DS_Store CHANGED
Binary files a/inpainter/.DS_Store and b/inpainter/.DS_Store differ
inpainter/base_inpainter.py CHANGED
@@ -7,7 +7,7 @@ import yaml
7
  import cv2
8
  import importlib
9
  import numpy as np
10
- from util.tensor_util import resize_frames, resize_masks
11
 
12
 
13
  class BaseInpainter:
@@ -15,7 +15,7 @@ class BaseInpainter:
15
  """
16
  E2FGVI_checkpoint: checkpoint of inpainter (version hq, with multi-resolution support)
17
  """
18
- net = importlib.import_module('model.e2fgvi_hq')
19
  self.model = net.InpaintGenerator().to(device)
20
  self.model.load_state_dict(torch.load(E2FGVI_checkpoint, map_location=device))
21
  self.model.eval()
@@ -67,6 +67,10 @@ class BaseInpainter:
67
  size = None
68
  else:
69
  size = (int(W*ratio), int(H*ratio))
70
 
71
  masks = np.expand_dims(masks, axis=3) # expand to T, H, W, 1
72
  binary_masks = resize_masks(masks, size)
7
  import cv2
8
  import importlib
9
  import numpy as np
10
+ from inpainter.util.tensor_util import resize_frames, resize_masks
11
 
12
 
13
  class BaseInpainter:
15
  """
16
  E2FGVI_checkpoint: checkpoint of inpainter (version hq, with multi-resolution support)
17
  """
18
+ net = importlib.import_module('inpainter.model.e2fgvi_hq')
19
  self.model = net.InpaintGenerator().to(device)
20
  self.model.load_state_dict(torch.load(E2FGVI_checkpoint, map_location=device))
21
  self.model.eval()
67
  size = None
68
  else:
69
  size = (int(W*ratio), int(H*ratio))
70
+ if size[0] % 2 > 0:
71
+ size[0] += 1
72
+ if size[1] % 2 > 0:
73
+ size[1] += 1
74
 
75
  masks = np.expand_dims(masks, axis=3) # expand to T, H, W, 1
76
  binary_masks = resize_masks(masks, size)
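One caution about the even-size adjustment added above: size is built as a tuple a few lines earlier (size = (int(W*ratio), int(H*ratio))), so the in-place size[0] += 1 and size[1] += 1 will raise a TypeError whenever the scaled width or height is odd. A small helper that performs the same rounding without mutating a tuple could look like the sketch below; the function name is illustrative, not part of the commit.

    def even_size(width, height, ratio):
        # scale by ratio and round each side up to the next even number,
        # which is what the lines added in this commit intend
        w, h = int(width * ratio), int(height * ratio)
        if w % 2:
            w += 1
        if h % 2:
            h += 1
        return (w, h)

    # e.g. size = None if ratio == 1 else even_size(W, H, ratio)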
inpainter/model/e2fgvi_hq.py CHANGED
@@ -5,10 +5,10 @@ import torch
5
  import torch.nn as nn
6
  import torch.nn.functional as F
7
 
8
- from model.modules.flow_comp import SPyNet
9
- from model.modules.feat_prop import BidirectionalPropagation, SecondOrderDeformableAlignment
10
- from model.modules.tfocal_transformer_hq import TemporalFocalTransformerBlock, SoftSplit, SoftComp
11
- from model.modules.spectral_norm import spectral_norm as _spectral_norm
12
 
13
 
14
  class BaseNetwork(nn.Module):
5
  import torch.nn as nn
6
  import torch.nn.functional as F
7
 
8
+ from inpainter.model.modules.flow_comp import SPyNet
9
+ from inpainter.model.modules.feat_prop import BidirectionalPropagation, SecondOrderDeformableAlignment
10
+ from inpainter.model.modules.tfocal_transformer_hq import TemporalFocalTransformerBlock, SoftSplit, SoftComp
11
+ from inpainter.model.modules.spectral_norm import spectral_norm as _spectral_norm
12
 
13
 
14
  class BaseNetwork(nn.Module):
inpainter/model/modules/feat_prop.py CHANGED
@@ -7,7 +7,7 @@ import torch.nn as nn
7
  from mmcv.ops import ModulatedDeformConv2d, modulated_deform_conv2d
8
  from mmengine.model import constant_init
9
 
10
- from model.modules.flow_comp import flow_warp
11
 
12
 
13
  class SecondOrderDeformableAlignment(ModulatedDeformConv2d):
7
  from mmcv.ops import ModulatedDeformConv2d, modulated_deform_conv2d
8
  from mmengine.model import constant_init
9
 
10
+ from inpainter.model.modules.flow_comp import flow_warp
11
 
12
 
13
  class SecondOrderDeformableAlignment(ModulatedDeformConv2d):
overleaf/.DS_Store ADDED
Binary file (6.15 kB).
overleaf/Track Anything.zip ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d271378ac9538e322b362b43a41e2c22a21cffac6f539a0c3e5b140c3b24b47e
3
+ size 5370701
overleaf/Track Anything/figs/avengers_1.pdf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a519eb00a2d315ecdc36b5a53e174e9b3361a9526c7fcd8a96bfefde2eeb940f
3
+ size 2570569
overleaf/Track Anything/figs/davisresults.pdf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fefd74df3daafd48ffb72a725c43354712a244db70e6c5d7ae8773203e0be492
3
+ size 1349133
overleaf/Track Anything/figs/failedcases.pdf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccb662ff62914d05fe8dc99640b9f89b32847675dd2069900a27771569378aa4
3
+ size 1200242
overleaf/Track Anything/figs/overview_4.pdf ADDED
Binary file (424 kB).
overleaf/Track Anything/neurips_2022.bbl ADDED
@@ -0,0 +1,105 @@
1
+ \begin{thebibliography}{10}
2
+
3
+ \bibitem{xmem}
4
+ Ho~Kei Cheng and Alexander~G. Schwing.
5
+ \newblock Xmem: Long-term video object segmentation with an atkinson-shiffrin
6
+ memory model.
7
+ \newblock In {\em {ECCV} {(28)}}, volume 13688 of {\em Lecture Notes in
8
+ Computer Science}, pages 640--658. Springer, 2022.
9
+
10
+ \bibitem{mivos}
11
+ Ho~Kei Cheng, Yu{-}Wing Tai, and Chi{-}Keung Tang.
12
+ \newblock Modular interactive video object segmentation: Interaction-to-mask,
13
+ propagation and difference-aware fusion.
14
+ \newblock In {\em {CVPR}}, pages 5559--5568. Computer Vision Foundation /
15
+ {IEEE}, 2021.
16
+
17
+ \bibitem{vit}
18
+ Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn,
19
+ Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg
20
+ Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby.
21
+ \newblock An image is worth 16x16 words: Transformers for image recognition at
22
+ scale.
23
+ \newblock In {\em {ICLR}}. OpenReview.net, 2021.
24
+
25
+ \bibitem{vos}
26
+ Mingqi Gao, Feng Zheng, James J.~Q. Yu, Caifeng Shan, Guiguang Ding, and
27
+ Jungong Han.
28
+ \newblock Deep learning for video object segmentation: a review.
29
+ \newblock {\em Artif. Intell. Rev.}, 56(1):457--531, 2023.
30
+
31
+ \bibitem{sam}
32
+ Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura
33
+ Gustafson, Tete Xiao, Spencer Whitehead, Alexander~C Berg, Wan-Yen Lo, et~al.
34
+ \newblock Segment anything.
35
+ \newblock {\em arXiv preprint arXiv:2304.02643}, 2023.
36
+
37
+ \bibitem{vot10}
38
+ Matej Kristan, Ale{\v{s}} Leonardis, Ji{\v{r}}{\'\i} Matas, Michael Felsberg,
39
+ Roman Pflugfelder, Joni-Kristian K{\"a}m{\"a}r{\"a}inen, Hyung~Jin Chang,
40
+ Martin Danelljan, Luka~{\v{C}}ehovin Zajc, Alan Luke{\v{z}}i{\v{c}}, et~al.
41
+ \newblock The tenth visual object tracking vot2022 challenge results.
42
+ \newblock In {\em Computer Vision--ECCV 2022 Workshops: Tel Aviv, Israel,
43
+ October 23--27, 2022, Proceedings, Part VIII}, pages 431--460. Springer,
44
+ 2023.
45
+
46
+ \bibitem{vot8}
47
+ Matej Kristan, Ale{\v{s}} Leonardis, Ji{\v{r}}{\'\i} Matas, Michael Felsberg,
48
+ Roman Pflugfelder, Joni-Kristian K{\"a}m{\"a}r{\"a}inen, Martin Danelljan,
49
+ Luka~{\v{C}}ehovin Zajc, Alan Luke{\v{z}}i{\v{c}}, Ondrej Drbohlav, et~al.
50
+ \newblock The eighth visual object tracking vot2020 challenge results.
51
+ \newblock In {\em European Conference on Computer Vision}, pages 547--601.
52
+ Springer, 2020.
53
+
54
+ \bibitem{vot6}
55
+ Matej Kristan, Ales Leonardis, Jiri Matas, Michael Felsberg, Roman Pflugfelder,
56
+ Luka ˇCehovin~Zajc, Tomas Vojir, Goutam Bhat, Alan Lukezic, Abdelrahman
57
+ Eldesokey, et~al.
58
+ \newblock The sixth visual object tracking vot2018 challenge results.
59
+ \newblock In {\em Proceedings of the European Conference on Computer Vision
60
+ (ECCV) Workshops}, pages 0--0, 2018.
61
+
62
+ \bibitem{vot9}
63
+ Matej Kristan, Ji{\v{r}}{\'\i} Matas, Ale{\v{s}} Leonardis, Michael Felsberg,
64
+ Roman Pflugfelder, Joni-Kristian K{\"a}m{\"a}r{\"a}inen, Hyung~Jin Chang,
65
+ Martin Danelljan, Luka Cehovin, Alan Luke{\v{z}}i{\v{c}}, et~al.
66
+ \newblock The ninth visual object tracking vot2021 challenge results.
67
+ \newblock In {\em Proceedings of the IEEE/CVF International Conference on
68
+ Computer Vision}, pages 2711--2738, 2021.
69
+
70
+ \bibitem{vot7}
71
+ Matej Kristan, Jiri Matas, Ales Leonardis, Michael Felsberg, Roman Pflugfelder,
72
+ Joni-Kristian Kamarainen, Luka ˇCehovin~Zajc, Ondrej Drbohlav, Alan Lukezic,
73
+ Amanda Berg, et~al.
74
+ \newblock The seventh visual object tracking vot2019 challenge results.
75
+ \newblock In {\em Proceedings of the IEEE/CVF International Conference on
76
+ Computer Vision Workshops}, pages 0--0, 2019.
77
+
78
+ \bibitem{e2fgvi}
79
+ Zhen Li, Chengze Lu, Jianhua Qin, Chun{-}Le Guo, and Ming{-}Ming Cheng.
80
+ \newblock Towards an end-to-end framework for flow-guided video inpainting.
81
+ \newblock In {\em {CVPR}}, pages 17541--17550. {IEEE}, 2022.
82
+
83
+ \bibitem{stm}
84
+ Seoung~Wug Oh, Joon{-}Young Lee, Ning Xu, and Seon~Joo Kim.
85
+ \newblock Video object segmentation using space-time memory networks.
86
+ \newblock In {\em {ICCV}}, pages 9225--9234. {IEEE}, 2019.
87
+
88
+ \bibitem{davis}
89
+ Jordi Pont{-}Tuset, Federico Perazzi, Sergi Caelles, Pablo Arbelaez, Alexander
90
+ Sorkine{-}Hornung, and Luc~Van Gool.
91
+ \newblock The 2017 {DAVIS} challenge on video object segmentation.
92
+ \newblock {\em CoRR}, abs/1704.00675, 2017.
93
+
94
+ \bibitem{siammask}
95
+ Qiang Wang, Li~Zhang, Luca Bertinetto, Weiming Hu, and Philip H.~S. Torr.
96
+ \newblock Fast online object tracking and segmentation: {A} unifying approach.
97
+ \newblock In {\em {CVPR}}, pages 1328--1338. Computer Vision Foundation /
98
+ {IEEE}, 2019.
99
+
100
+ \bibitem{aot}
101
+ Zongxin Yang, Yunchao Wei, and Yi~Yang.
102
+ \newblock Associating objects with transformers for video object segmentation.
103
+ \newblock In {\em NeurIPS}, pages 2491--2502, 2021.
104
+
105
+ \end{thebibliography}
overleaf/Track Anything/neurips_2022.bib ADDED
@@ -0,0 +1,187 @@
1
+ @article{sam,
2
+ title={Segment anything},
3
+ author={Kirillov, Alexander and Mintun, Eric and Ravi, Nikhila and Mao, Hanzi and Rolland, Chloe and Gustafson, Laura and Xiao, Tete and Whitehead, Spencer and Berg, Alexander C and Lo, Wan-Yen and others},
4
+ journal={arXiv preprint arXiv:2304.02643},
5
+ year={2023}
6
+ }
7
+
8
+ @inproceedings{xmem,
9
+ author = {Ho Kei Cheng and
10
+ Alexander G. Schwing},
11
+ title = {XMem: Long-Term Video Object Segmentation with an Atkinson-Shiffrin
12
+ Memory Model},
13
+ booktitle = {{ECCV} {(28)}},
14
+ series = {Lecture Notes in Computer Science},
15
+ volume = {13688},
16
+ pages = {640--658},
17
+ publisher = {Springer},
18
+ year = {2022}
19
+ }
20
+
21
+
22
+ %related
23
+
24
+ @article{vos,
25
+ author = {Mingqi Gao and
26
+ Feng Zheng and
27
+ James J. Q. Yu and
28
+ Caifeng Shan and
29
+ Guiguang Ding and
30
+ Jungong Han},
31
+ title = {Deep learning for video object segmentation: a review},
32
+ journal = {Artif. Intell. Rev.},
33
+ volume = {56},
34
+ number = {1},
35
+ pages = {457--531},
36
+ year = {2023}
37
+ }
38
+
39
+ @inproceedings{vot9,
40
+ title={The ninth visual object tracking vot2021 challenge results},
41
+ author={Kristan, Matej and Matas, Ji{\v{r}}{\'\i} and Leonardis, Ale{\v{s}} and Felsberg, Michael and Pflugfelder, Roman and K{\"a}m{\"a}r{\"a}inen, Joni-Kristian and Chang, Hyung Jin and Danelljan, Martin and Cehovin, Luka and Luke{\v{z}}i{\v{c}}, Alan and others},
42
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
43
+ pages={2711--2738},
44
+ year={2021}
45
+ }
46
+
47
+ @inproceedings{vot10,
48
+ title={The Tenth Visual Object Tracking VOT2022 Challenge Results},
49
+ author={Kristan, Matej and Leonardis, Ale{\v{s}} and Matas, Ji{\v{r}}{\'\i} and Felsberg, Michael and Pflugfelder, Roman and K{\"a}m{\"a}r{\"a}inen, Joni-Kristian and Chang, Hyung Jin and Danelljan, Martin and Zajc, Luka {\v{C}}ehovin and Luke{\v{z}}i{\v{c}}, Alan and others},
50
+ booktitle={Computer Vision--ECCV 2022 Workshops: Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part VIII},
51
+ pages={431--460},
52
+ year={2023},
53
+ organization={Springer}
54
+ }
55
+
56
+ @inproceedings{vot8,
57
+ title={The eighth visual object tracking VOT2020 challenge results},
58
+ author={Kristan, Matej and Leonardis, Ale{\v{s}} and Matas, Ji{\v{r}}{\'\i} and Felsberg, Michael and Pflugfelder, Roman and K{\"a}m{\"a}r{\"a}inen, Joni-Kristian and Danelljan, Martin and Zajc, Luka {\v{C}}ehovin and Luke{\v{z}}i{\v{c}}, Alan and Drbohlav, Ondrej and others},
59
+ booktitle={European Conference on Computer Vision},
60
+ pages={547--601},
61
+ year={2020},
62
+ organization={Springer}
63
+ }
64
+ @inproceedings{vot7,
65
+ title={The seventh visual object tracking vot2019 challenge results},
66
+ author={Kristan, Matej and Matas, Jiri and Leonardis, Ales and Felsberg, Michael and Pflugfelder, Roman and Kamarainen, Joni-Kristian and ˇCehovin Zajc, Luka and Drbohlav, Ondrej and Lukezic, Alan and Berg, Amanda and others},
67
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision Workshops},
68
+ pages={0--0},
69
+ year={2019}
70
+ }
71
+ @inproceedings{vot6,
72
+ title={The sixth visual object tracking vot2018 challenge results},
73
+ author={Kristan, Matej and Leonardis, Ales and Matas, Jiri and Felsberg, Michael and Pflugfelder, Roman and ˇCehovin Zajc, Luka and Vojir, Tomas and Bhat, Goutam and Lukezic, Alan and Eldesokey, Abdelrahman and others},
74
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV) Workshops},
75
+ pages={0--0},
76
+ year={2018}
77
+ }
78
+
79
+ @inproceedings{vit,
80
+ author = {Alexey Dosovitskiy and
81
+ Lucas Beyer and
82
+ Alexander Kolesnikov and
83
+ Dirk Weissenborn and
84
+ Xiaohua Zhai and
85
+ Thomas Unterthiner and
86
+ Mostafa Dehghani and
87
+ Matthias Minderer and
88
+ Georg Heigold and
89
+ Sylvain Gelly and
90
+ Jakob Uszkoreit and
91
+ Neil Houlsby},
92
+ title = {An Image is Worth 16x16 Words: Transformers for Image Recognition
93
+ at Scale},
94
+ booktitle = {{ICLR}},
95
+ publisher = {OpenReview.net},
96
+ year = {2021}
97
+ }
98
+
99
+ @inproceedings{stm,
100
+ author = {Seoung Wug Oh and
101
+ Joon{-}Young Lee and
102
+ Ning Xu and
103
+ Seon Joo Kim},
104
+ title = {Video Object Segmentation Using Space-Time Memory Networks},
105
+ booktitle = {{ICCV}},
106
+ pages = {9225--9234},
107
+ publisher = {{IEEE}},
108
+ year = {2019}
109
+ }
110
+
111
+ @inproceedings{siammask,
112
+ author = {Qiang Wang and
113
+ Li Zhang and
114
+ Luca Bertinetto and
115
+ Weiming Hu and
116
+ Philip H. S. Torr},
117
+ title = {Fast Online Object Tracking and Segmentation: {A} Unifying Approach},
118
+ booktitle = {{CVPR}},
119
+ pages = {1328--1338},
120
+ publisher = {Computer Vision Foundation / {IEEE}},
121
+ year = {2019}
122
+ }
123
+
124
+ @inproceedings{mivos,
125
+ author = {Ho Kei Cheng and
126
+ Yu{-}Wing Tai and
127
+ Chi{-}Keung Tang},
128
+ title = {Modular Interactive Video Object Segmentation: Interaction-to-Mask,
129
+ Propagation and Difference-Aware Fusion},
130
+ booktitle = {{CVPR}},
131
+ pages = {5559--5568},
132
+ publisher = {Computer Vision Foundation / {IEEE}},
133
+ year = {2021}
134
+ }
135
+
136
+ @article{davis,
137
+ author = {Jordi Pont{-}Tuset and
138
+ Federico Perazzi and
139
+ Sergi Caelles and
140
+ Pablo Arbelaez and
141
+ Alexander Sorkine{-}Hornung and
142
+ Luc Van Gool},
143
+ title = {The 2017 {DAVIS} Challenge on Video Object Segmentation},
144
+ journal = {CoRR},
145
+ volume = {abs/1704.00675},
146
+ year = {2017}
147
+ }
148
+
149
+ @inproceedings{aot,
150
+ author = {Zongxin Yang and
151
+ Yunchao Wei and
152
+ Yi Yang},
153
+ title = {Associating Objects with Transformers for Video Object Segmentation},
154
+ booktitle = {NeurIPS},
155
+ pages = {2491--2502},
156
+ year = {2021}
157
+ }
158
+
159
+ @inproceedings{icip,
160
+ author = {St{\'{e}}phane Vujasinovic and
161
+ Sebastian Bullinger and
162
+ Stefan Becker and
163
+ Norbert Scherer{-}Negenborn and
164
+ Michael Arens and
165
+ Rainer Stiefelhagen},
166
+ title = {Revisiting Click-Based Interactive Video Object Segmentation},
167
+ booktitle = {{ICIP}},
168
+ pages = {2756--2760},
169
+ publisher = {{IEEE}},
170
+ year = {2022}
171
+ }
172
+
173
+
174
+
175
+
176
+ @inproceedings{e2fgvi,
177
+ author = {Zhen Li and
178
+ Chengze Lu and
179
+ Jianhua Qin and
180
+ Chun{-}Le Guo and
181
+ Ming{-}Ming Cheng},
182
+ title = {Towards An End-to-End Framework for Flow-Guided Video Inpainting},
183
+ booktitle = {{CVPR}},
184
+ pages = {17541--17550},
185
+ publisher = {{IEEE}},
186
+ year = {2022}
187
+ }
overleaf/Track Anything/neurips_2022.sty ADDED
@@ -0,0 +1,381 @@
1
+ % partial rewrite of the LaTeX2e package for submissions to the
2
+ % Conference on Neural Information Processing Systems (NeurIPS):
3
+ %
4
+ % - uses more LaTeX conventions
5
+ % - line numbers at submission time replaced with aligned numbers from
6
+ % lineno package
7
+ % - \nipsfinalcopy replaced with [final] package option
8
+ % - automatically loads times package for authors
9
+ % - loads natbib automatically; this can be suppressed with the
10
+ % [nonatbib] package option
11
+ % - adds foot line to first page identifying the conference
12
+ % - adds preprint option for submission to e.g. arXiv
13
+ % - conference acronym modified
14
+ %
15
+ % Roman Garnett (garnett@wustl.edu) and the many authors of
16
+ % nips15submit_e.sty, including MK and drstrip@sandia
17
+ %
18
+ % last revision: March 2022
19
+
20
+ \NeedsTeXFormat{LaTeX2e}
21
+ \ProvidesPackage{neurips_2022}[2022/03/31 NeurIPS 2022 submission/camera-ready style file]
22
+
23
+ % declare final option, which creates camera-ready copy
24
+ \newif\if@neuripsfinal\@neuripsfinalfalse
25
+ \DeclareOption{final}{
26
+ \@neuripsfinaltrue
27
+ }
28
+
29
+ % declare nonatbib option, which does not load natbib in case of
30
+ % package clash (users can pass options to natbib via
31
+ % \PassOptionsToPackage)
32
+ \newif\if@natbib\@natbibtrue
33
+ \DeclareOption{nonatbib}{
34
+ \@natbibfalse
35
+ }
36
+
37
+ % declare preprint option, which creates a preprint version ready for
38
+ % upload to, e.g., arXiv
39
+ \newif\if@preprint\@preprintfalse
40
+ \DeclareOption{preprint}{
41
+ \@preprinttrue
42
+ }
43
+
44
+ \ProcessOptions\relax
45
+
46
+ % determine whether this is an anonymized submission
47
+ \newif\if@submission\@submissiontrue
48
+ \if@neuripsfinal\@submissionfalse\fi
49
+ \if@preprint\@submissionfalse\fi
50
+
51
+ % fonts
52
+ \renewcommand{\rmdefault}{ptm}
53
+ \renewcommand{\sfdefault}{phv}
54
+
55
+ % change this every year for notice string at bottom
56
+ \newcommand{\@neuripsordinal}{36th}
57
+ \newcommand{\@neuripsyear}{2022}
58
+ \newcommand{\@neuripslocation}{New Orleans}
59
+
60
+ % acknowledgments
61
+ \usepackage{environ}
62
+ \newcommand{\acksection}{\section*{Acknowledgments and Disclosure of Funding}}
63
+ \NewEnviron{ack}{%
64
+ \acksection
65
+ \BODY
66
+ }
67
+
68
+
69
+ % load natbib unless told otherwise
70
+ %\if@natbib
71
+ % \RequirePackage{natbib}
72
+ %\fi
73
+
74
+ % set page geometry
75
+ \usepackage[verbose=true,letterpaper]{geometry}
76
+ \AtBeginDocument{
77
+ \newgeometry{
78
+ textheight=9in,
79
+ textwidth=5.5in,
80
+ top=1in,
81
+ headheight=12pt,
82
+ headsep=25pt,
83
+ footskip=30pt
84
+ }
85
+ \@ifpackageloaded{fullpage}
86
+ {\PackageWarning{neurips_2022}{fullpage package not allowed! Overwriting formatting.}}
87
+ {}
88
+ }
89
+
90
+ \widowpenalty=10000
91
+ \clubpenalty=10000
92
+ \flushbottom
93
+ \sloppy
94
+
95
+
96
+ % font sizes with reduced leading
97
+ \renewcommand{\normalsize}{%
98
+ \@setfontsize\normalsize\@xpt\@xipt
99
+ \abovedisplayskip 7\p@ \@plus 2\p@ \@minus 5\p@
100
+ \abovedisplayshortskip \z@ \@plus 3\p@
101
+ \belowdisplayskip \abovedisplayskip
102
+ \belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@
103
+ }
104
+ \normalsize
105
+ \renewcommand{\small}{%
106
+ \@setfontsize\small\@ixpt\@xpt
107
+ \abovedisplayskip 6\p@ \@plus 1.5\p@ \@minus 4\p@
108
+ \abovedisplayshortskip \z@ \@plus 2\p@
109
+ \belowdisplayskip \abovedisplayskip
110
+ \belowdisplayshortskip 3\p@ \@plus 2\p@ \@minus 2\p@
111
+ }
112
+ \renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt}
113
+ \renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt}
114
+ \renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt}
115
+ \renewcommand{\large}{\@setfontsize\large\@xiipt{14}}
116
+ \renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}}
117
+ \renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}}
118
+ \renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}}
119
+ \renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}}
120
+
121
+ % sections with less space
122
+ \providecommand{\section}{}
123
+ \renewcommand{\section}{%
124
+ \@startsection{section}{1}{\z@}%
125
+ {-2.0ex \@plus -0.5ex \@minus -0.2ex}%
126
+ { 1.5ex \@plus 0.3ex \@minus 0.2ex}%
127
+ {\large\bf\raggedright}%
128
+ }
129
+ \providecommand{\subsection}{}
130
+ \renewcommand{\subsection}{%
131
+ \@startsection{subsection}{2}{\z@}%
132
+ {-1.8ex \@plus -0.5ex \@minus -0.2ex}%
133
+ { 0.8ex \@plus 0.2ex}%
134
+ {\normalsize\bf\raggedright}%
135
+ }
136
+ \providecommand{\subsubsection}{}
137
+ \renewcommand{\subsubsection}{%
138
+ \@startsection{subsubsection}{3}{\z@}%
139
+ {-1.5ex \@plus -0.5ex \@minus -0.2ex}%
140
+ { 0.5ex \@plus 0.2ex}%
141
+ {\normalsize\bf\raggedright}%
142
+ }
143
+ \providecommand{\paragraph}{}
144
+ \renewcommand{\paragraph}{%
145
+ \@startsection{paragraph}{4}{\z@}%
146
+ {1.5ex \@plus 0.5ex \@minus 0.2ex}%
147
+ {-1em}%
148
+ {\normalsize\bf}%
149
+ }
150
+ \providecommand{\subparagraph}{}
151
+ \renewcommand{\subparagraph}{%
152
+ \@startsection{subparagraph}{5}{\z@}%
153
+ {1.5ex \@plus 0.5ex \@minus 0.2ex}%
154
+ {-1em}%
155
+ {\normalsize\bf}%
156
+ }
157
+ \providecommand{\subsubsubsection}{}
158
+ \renewcommand{\subsubsubsection}{%
159
+ \vskip5pt{\noindent\normalsize\rm\raggedright}%
160
+ }
161
+
162
+ % float placement
163
+ \renewcommand{\topfraction }{0.85}
164
+ \renewcommand{\bottomfraction }{0.4}
165
+ \renewcommand{\textfraction }{0.1}
166
+ \renewcommand{\floatpagefraction}{0.7}
167
+
168
+ \newlength{\@neuripsabovecaptionskip}\setlength{\@neuripsabovecaptionskip}{7\p@}
169
+ \newlength{\@neuripsbelowcaptionskip}\setlength{\@neuripsbelowcaptionskip}{\z@}
170
+
171
+ \setlength{\abovecaptionskip}{\@neuripsabovecaptionskip}
172
+ \setlength{\belowcaptionskip}{\@neuripsbelowcaptionskip}
173
+
174
+ % swap above/belowcaptionskip lengths for tables
175
+ \renewenvironment{table}
176
+ {\setlength{\abovecaptionskip}{\@neuripsbelowcaptionskip}%
177
+ \setlength{\belowcaptionskip}{\@neuripsabovecaptionskip}%
178
+ \@float{table}}
179
+ {\end@float}
180
+
181
+ % footnote formatting
182
+ \setlength{\footnotesep }{6.65\p@}
183
+ \setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@}
184
+ \renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@}
185
+ \setcounter{footnote}{0}
186
+
187
+ % paragraph formatting
188
+ \setlength{\parindent}{\z@}
189
+ \setlength{\parskip }{5.5\p@}
190
+
191
+ % list formatting
192
+ \setlength{\topsep }{4\p@ \@plus 1\p@ \@minus 2\p@}
193
+ \setlength{\partopsep }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@}
194
+ \setlength{\itemsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@}
195
+ \setlength{\parsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@}
196
+ \setlength{\leftmargin }{3pc}
197
+ \setlength{\leftmargini }{\leftmargin}
198
+ \setlength{\leftmarginii }{2em}
199
+ \setlength{\leftmarginiii}{1.5em}
200
+ \setlength{\leftmarginiv }{1.0em}
201
+ \setlength{\leftmarginv }{0.5em}
202
+ \def\@listi {\leftmargin\leftmargini}
203
+ \def\@listii {\leftmargin\leftmarginii
204
+ \labelwidth\leftmarginii
205
+ \advance\labelwidth-\labelsep
206
+ \topsep 2\p@ \@plus 1\p@ \@minus 0.5\p@
207
+ \parsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@
208
+ \itemsep \parsep}
209
+ \def\@listiii{\leftmargin\leftmarginiii
210
+ \labelwidth\leftmarginiii
211
+ \advance\labelwidth-\labelsep
212
+ \topsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@
213
+ \parsep \z@
214
+ \partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@
215
+ \itemsep \topsep}
216
+ \def\@listiv {\leftmargin\leftmarginiv
217
+ \labelwidth\leftmarginiv
218
+ \advance\labelwidth-\labelsep}
219
+ \def\@listv {\leftmargin\leftmarginv
220
+ \labelwidth\leftmarginv
221
+ \advance\labelwidth-\labelsep}
222
+ \def\@listvi {\leftmargin\leftmarginvi
223
+ \labelwidth\leftmarginvi
224
+ \advance\labelwidth-\labelsep}
225
+
226
+ % create title
227
+ \providecommand{\maketitle}{}
228
+ \renewcommand{\maketitle}{%
229
+ \par
230
+ \begingroup
231
+ \renewcommand{\thefootnote}{\fnsymbol{footnote}}
232
+ % for perfect author name centering
233
+ \renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}}
234
+ % The footnote-mark was overlapping the footnote-text,
235
+ % added the following to fix this problem (MK)
236
+ \long\def\@makefntext##1{%
237
+ \parindent 1em\noindent
238
+ \hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1
239
+ }
240
+ \thispagestyle{empty}
241
+ \@maketitle
242
+ \@thanks
243
+ \@notice
244
+ \endgroup
245
+ \let\maketitle\relax
246
+ \let\thanks\relax
247
+ }
248
+
249
+ % rules for title box at top of first page
250
+ \newcommand{\@toptitlebar}{
251
+ \hrule height 4\p@
252
+ \vskip 0.25in
253
+ \vskip -\parskip%
254
+ }
255
+ \newcommand{\@bottomtitlebar}{
256
+ \vskip 0.29in
257
+ \vskip -\parskip
258
+ \hrule height 1\p@
259
+ \vskip 0.09in%
260
+ }
261
+
262
+ % create title (includes both anonymized and non-anonymized versions)
263
+ \providecommand{\@maketitle}{}
264
+ \renewcommand{\@maketitle}{%
265
+ \vbox{%
266
+ \hsize\textwidth
267
+ \linewidth\hsize
268
+ \vskip 0.1in
269
+ \@toptitlebar
270
+ \centering
271
+ {\LARGE\bf \@title\par}
272
+ \@bottomtitlebar
273
+ \if@submission
274
+ \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}
275
+ Anonymous Author(s) \\
276
+ Affiliation \\
277
+ Address \\
278
+ \texttt{email} \\
279
+ \end{tabular}%
280
+ \else
281
+ \def\And{%
282
+ \end{tabular}\hfil\linebreak[0]\hfil%
283
+ \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces%
284
+ }
285
+ \def\AND{%
286
+ \end{tabular}\hfil\linebreak[4]\hfil%
287
+ \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces%
288
+ }
289
+ \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}%
290
+ \fi
291
+ \vskip 0.3in \@minus 0.1in
292
+ }
293
+ }
294
+
295
+ % add conference notice to bottom of first page
296
+ \newcommand{\ftype@noticebox}{8}
297
+ \newcommand{\@notice}{%
298
+ % give a bit of extra room back to authors on first page
299
+ \enlargethispage{2\baselineskip}%
300
+ \@float{noticebox}[b]%
301
+ \footnotesize\@noticestring%
302
+ \end@float%
303
+ }
304
+
305
+ % abstract styling
306
+ \renewenvironment{abstract}%
307
+ {%
308
+ \vskip 0.075in%
309
+ \centerline%
310
+ {\large\bf Abstract}%
311
+ \vspace{0.5ex}%
312
+ \begin{quote}%
313
+ }
314
+ {
315
+ \par%
316
+ \end{quote}%
317
+ \vskip 1ex%
318
+ }
319
+
320
+ % For the paper checklist
321
+ \newcommand{\answerYes}[1][]{\textcolor{blue}{[Yes] #1}}
322
+ \newcommand{\answerNo}[1][]{\textcolor{orange}{[No] #1}}
323
+ \newcommand{\answerNA}[1][]{\textcolor{gray}{[N/A] #1}}
324
+ \newcommand{\answerTODO}[1][]{\textcolor{red}{\bf [TODO]}}
325
+
326
+ % handle tweaks for camera-ready copy vs. submission copy
327
+ \if@preprint
328
+ \newcommand{\@noticestring}{%
329
+ Preprint. Under review.%
330
+ }
331
+ \else
332
+ \if@neuripsfinal
333
+ \newcommand{\@noticestring}{%
334
+ \@neuripsordinal\/ Conference on Neural Information Processing Systems
335
+ (NeurIPS \@neuripsyear).%, \@neuripslocation.%
336
+ }
337
+ \else
338
+ \newcommand{\@noticestring}{%
339
+ Submitted to \@neuripsordinal\/ Conference on Neural Information
340
+ Processing Systems (NeurIPS \@neuripsyear). Do not distribute.%
341
+ }
342
+
343
+ % hide the acknowledgements
344
+ \NewEnviron{hide}{}
345
+ \let\ack\hide
346
+ \let\endack\endhide
347
+
348
+ % line numbers for submission
349
+ \RequirePackage{lineno}
350
+ \linenumbers
351
+
352
+ % fix incompatibilities between lineno and amsmath, if required, by
353
+ % transparently wrapping linenomath environments around amsmath
354
+ % environments
355
+ \AtBeginDocument{%
356
+ \@ifpackageloaded{amsmath}{%
357
+ \newcommand*\patchAmsMathEnvironmentForLineno[1]{%
358
+ \expandafter\let\csname old#1\expandafter\endcsname\csname #1\endcsname
359
+ \expandafter\let\csname oldend#1\expandafter\endcsname\csname end#1\endcsname
360
+ \renewenvironment{#1}%
361
+ {\linenomath\csname old#1\endcsname}%
362
+ {\csname oldend#1\endcsname\endlinenomath}%
363
+ }%
364
+ \newcommand*\patchBothAmsMathEnvironmentsForLineno[1]{%
365
+ \patchAmsMathEnvironmentForLineno{#1}%
366
+ \patchAmsMathEnvironmentForLineno{#1*}%
367
+ }%
368
+ \patchBothAmsMathEnvironmentsForLineno{equation}%
369
+ \patchBothAmsMathEnvironmentsForLineno{align}%
370
+ \patchBothAmsMathEnvironmentsForLineno{flalign}%
371
+ \patchBothAmsMathEnvironmentsForLineno{alignat}%
372
+ \patchBothAmsMathEnvironmentsForLineno{gather}%
373
+ \patchBothAmsMathEnvironmentsForLineno{multline}%
374
+ }
375
+ {}
376
+ }
377
+ \fi
378
+ \fi
379
+
380
+
381
+ \endinput
overleaf/Track Anything/neurips_2022.tex ADDED
@@ -0,0 +1,378 @@
1
+ \documentclass{article}
2
+
3
+
4
+ % if you need to pass options to natbib, use, e.g.:
5
+ % \PassOptionsToPackage{numbers, compress}{natbib}
6
+ % before loading neurips_2022
7
+
8
+
9
+ % ready for submission
10
+ % \usepackage{neurips_2022}
11
+
12
+
13
+ % to compile a preprint version, e.g., for submission to arXiv, add add the
14
+ % [preprint] option:
15
+ \usepackage[preprint]{neurips_2022}
16
+
17
+ % to compile a camera-ready version, add the [final] option, e.g.:
18
+ % \usepackage[final]{neurips_2022}
19
+
20
+
21
+ % to avoid loading the natbib package, add option nonatbib:
22
+ % \usepackage[nonatbib]{neurips_2022}
23
+ \usepackage{graphicx}
24
+ \usepackage[utf8]{inputenc} % allow utf-8 input
25
+ \usepackage[T1]{fontenc} % use 8-bit T1 fonts
26
+ \usepackage{hyperref} % hyperlinks
27
+ \usepackage{url} % simple URL typesetting
28
+ \usepackage{booktabs} % professional-quality tables
29
+ \usepackage{amsfonts} % blackboard math symbols
30
+ \usepackage{nicefrac} % compact symbols for 1/2, etc.
31
+ \usepackage{microtype} % microtypography
32
+ \usepackage{xcolor} % colors
33
+ % \usepackage{acmart}
34
+
35
+ \title{Track Anything: High-performance Interactive Tracking and Segmentation}
36
+ \title{Track Anything: High-performance Object Tracking in Videos by Interactive Masks}
37
+ % \title{Track Anything: Interaction to Mask in Videos}
38
+ \title{Track Anything: Segment Anything Meets Videos}
39
+
40
+ % \author{%
41
+ % David S.~Hippocampus\thanks{Use footnote for providing further information
42
+ % about author (webpage, alternative address)---\emph{not} for acknowledging
43
+ % funding agencies.} \\
44
+ % SUSTech VIPG\\
45
+
46
+ % \author{Jinyu Yang}
47
+ % \authornote{equal}
48
+
49
+ % \author{Mingqi Gao}
50
+ % \authornotemark[1]
51
+
52
+ \author{%
53
+ Jinyu Yang\thanks{Equal contribution. Alphabetical order.},\enskip Mingqi Gao\footnotemark[1],\enskip Zhe Li\footnotemark[1],\enskip Shang Gao, Fangjing Wang, Feng Zheng \\
54
+ SUSTech VIP Lab\\
55
+ % Cranberry-Lemon University\\
56
+ % Pittsburgh, PA 15213 \\
57
+ % \texttt{hippo@cs.cranberry-lemon.edu} \\
58
+ % \url{https://github.com/gaomingqi/Track-Anything}\\
59
+ % examples of more authors
60
+ % \And
61
+ % Coauthor \\
62
+ % Affiliation \\
63
+ % Address \\
64
+ % \texttt{email} \\
65
+ % \AND
66
+ % Coauthor \\
67
+ % Affiliation \\
68
+ % Address \\
69
+ % \texttt{email} \\
70
+ % \And
71
+ % Coauthor \\
72
+ % Affiliation \\
73
+ % Address \\
74
+ % \texttt{email} \\
75
+ % \And
76
+ % Coauthor \\
77
+ % Affiliation \\
78
+ % Address \\
79
+ % \texttt{email} \\
80
+ % \thanks{these authors contributed equally}
81
+ }
82
+ % \affiliation{\institution{SUSTech VIP Lab}}
83
+ % \footnote{Equal contribution. Alphabetical order.}
84
+
85
+ \begin{document}
86
+
87
+
88
+ \maketitle
89
+
90
+
91
+ \begin{abstract}
92
+
93
+ Recently, the Segment Anything Model (SAM) has rapidly gained widespread attention due to its impressive segmentation performance on images.
94
+ Despite its strong ability in image segmentation and its high interactivity with different prompts, we find that it performs poorly on consistent segmentation in videos.
95
+ Therefore, in this report, we propose Track Anything Model (TAM), which achieves high-performance interactive tracking and segmentation in videos.
96
+ In detail, given a video sequence, with only a little human participation, \textit{i.e.}, several clicks, users can track anything they are interested in and obtain satisfactory results in one-pass inference.
97
+ Without additional training, such an interactive design performs impressively on video object tracking and segmentation.
98
+ % superior to prior works on video object tracking and segmentation.
99
+ All resources are available on \url{https://github.com/gaomingqi/Track-Anything}.
100
+ We hope this work can facilitate related research.
101
+
102
+ \end{abstract}
103
+
104
+ \section{Introduction}
105
+
106
+ Tracking an arbitrary object in generic scenes is important, and Video Object Tracking (VOT) is a fundamental task in computer vision.
107
+ Similar to VOT, Video Object Segmentation (VOS) aims to separate the target (region of interest) from the background in a video sequence, which can be seen as a kind of more fine-grained object tracking.
108
+ We notice that current state-of-the-art video trackers/segmenters are trained on large-scale manually-annotated datasets and initialized by a bounding box or a segmentation mask.
109
+ On the one hand, the massive human labor force is hidden behind huge amounts of labeled data.
110
+ % Recently, interactive algorithms help to liberate users from labor-expensive initialization and annotation.
111
+ On the other hand, current initialization settings, especially in semi-supervised VOS, require a specific object mask ground truth for model initialization.
112
+ How to liberate researchers from labor-expensive annotation and initialization is therefore of great importance.
113
+
114
+
115
+ Recently, the Segment Anything Model (SAM)~\cite{sam}, a large foundation model for image segmentation, has been proposed.
116
+ It supports flexible prompts and computes masks in real-time, thus allowing interactive use.
117
+ We conclude that SAM has the following advantages that can assist interactive tracking:
118
+ \textbf{1) Strong image segmentation ability.}
119
+ Trained on 11 million images and 1.1 billion masks, SAM can produce high-quality masks and perform zero-shot segmentation in generic scenarios.
120
+ \textbf{2) High interactivity with different kinds of prompts. }
121
+ With user-friendly input prompts such as points, boxes, or language, SAM can produce satisfactory segmentation masks for specific image areas.
122
+ However, directly applying SAM to videos does not yield impressive performance due to its lack of temporal correspondence.
123
+
124
+ On the other hand, tracking or segmenting in videos faces challenges from scale variation, target deformation, motion blur, camera motion, similar objects, and so on~\cite{vos,vot6,vot7,vot8,vot9,vot10}.
125
+ Even state-of-the-art models suffer from complex scenarios in public datasets~\cite{xmem}, not to mention real-world applications.
126
+ Therefore, we consider the following question:
127
+ \textit{can we achieve high-performance tracking/segmentation in videos through the way of interaction?}
128
+
129
+ In this technical report, we introduce our Track-Anything project, which develops an efficient toolkit for high-performance object tracking and segmentation in videos.
130
+ With a user-friendly interface, the Track Anything Model (TAM) can track and segment any objects in a given video with only one-pass inference.
131
+ Figure~\ref{fig:overview} shows the one-pass interactive process in the proposed TAM.
132
+ In detail, TAM combines SAM~\cite{sam}, a large segmentation model, and XMem~\cite{xmem}, an advanced VOS model.
133
+ As shown, we integrate them in an interactive way.
134
+ Firstly, users can interactively initialize SAM, \textit{i.e.}, by clicking on the object, to define a target object;
135
+ then, XMem is used to give a mask prediction of the object in the next frame according to both temporal and spatial correspondence;
136
+ next, SAM is utilized to give a more precise mask description;
137
+ during the tracking process, users can pause and correct as soon as they notice tracking failures.
138
+
139
+ Our contributions can be summarized as follows:
140
+
141
+ 1) We extend SAM applications to the video level to achieve interactive video object tracking and segmentation.
142
+ % We combine the SAM with VOS models to achieve interactive video object tracking and segmentation.
143
+ Rather than separately using SAM per frame, we integrate SAM into the process of temporal correspondence construction.
144
+
145
+ 2) We propose one-pass interactive tracking and segmentation for efficient annotation and a user-friendly tracking interface, which solves extreme difficulties in video object perception with a very small amount of human participation.
146
+
147
+ 3) Our proposed method shows superior performance and high usability in complex scenes and has many potential applications.
148
+
149
+ % \section{Related Works}
150
+
151
+ % \textbf{Video Object Tracking.}
152
+
153
+
154
+
155
+ % \textbf{Video Object Segmentation.}
156
+ \section{Track Anything Task}
157
+
158
+ Inspired by the Segment Anything task~\cite{sam}, we propose the Track Anything task, which aims at flexible object tracking in arbitrary videos.
159
+ Here we define that the target objects can be flexibly selected, added, or removed in any way according to the users' interests.
160
+ Also, the video length and type can be arbitrary rather than limited to trimmed or natural videos.
161
+ With such settings, diverse downstream tasks can be achieved, including single/multiple object tracking, short-/long-term object tracking, unsupervised VOS, semi-supervised VOS, referring VOS, interactive VOS, long-term VOS, and so on.
162
+
163
+ \section{Methodology}
164
+
165
+ \subsection{Preliminaries}
166
+
167
+ \textbf{Segment Anything Model~\cite{sam}.}
168
+ Very recently, the Segment Anything Model (SAM) was proposed by Meta AI Research and has received considerable attention.
169
+ As a foundation model for image segmentation, SAM is based on ViT~\cite{vit} and trained on the large-scale dataset SA-1B~\cite{sam}.
170
+ SAM shows promising segmentation ability on images, especially on zero-shot segmentation tasks.
171
+ Unfortunately, SAM shows superior performance only on image segmentation and cannot deal with complex video segmentation.
172
+
173
+
174
+ \textbf{XMem~\cite{xmem}.}
175
+ Given the mask description of the target object at the first frame, XMem can track the object and generate corresponding masks in the subsequent frames.
176
+ Inspired by the Atkinson-Shiffrin memory model, it aims to solve the difficulties in long-term videos with unified feature memory stores.
177
+ The drawbacks of XMem are also obvious: 1) as a semi-supervised VOS model, it requires a precise mask to initialize; 2) for long videos, it is difficult for XMem to recover from tracking or segmentation failure.
178
+ In this paper, we address both difficulties by introducing interactive tracking with SAM.
179
+
180
+
181
+ \textbf{Interactive Video Object Segmentation.}
182
+ Interactive VOS~\cite{mivos} takes user interactions as inputs, \textit{e.g.}, scribbles.
183
+ Then, users can iteratively refine the segmentation results until they are satisfied with them.
184
+ Interactive VOS has gained much attention, as it is much easier to provide scribbles than to specify every pixel of an object mask.
185
+ However, we found that current interactive VOS methods require multiple rounds to refine the results, which impedes their efficiency in real-world applications.
186
+
187
+ \begin{figure}[t]
188
+ \centering
189
+ \includegraphics[width=\linewidth]{figs/overview_4.pdf}
190
+ \caption{Pipeline of our proposed Track Anything Model (TAM). Within only one round of inference, TAM obtains impressive tracking and segmentation performance on the human-selected target.}
191
+ \label{fig:overview}
192
+ \end{figure}
193
+
194
+ \begin{table}
195
+ \caption{Results on DAVIS-2016-val and DAVIS-2017-test-dev datasets~\cite{davis}.}
196
+ \label{davis1617}
197
+ \centering
198
+ \small
199
+ \setlength\tabcolsep{4pt}
200
+ \begin{tabular}{l|c|c|c|ccc|ccc}
201
+ \toprule
202
+ & & & &\multicolumn{3}{c|}{DAVIS-2016-val} &\multicolumn{3}{c}{DAVIS-2017-test-dev} \\
203
+ Method & Venue & Initialization & Evaluation& $J\&F$ & $J$ &$F$ &$J\&F$ & $J$ &$F$\\
204
+ \midrule
205
+ STM~\cite{stm} & ICCV2019 &Mask & One Pass &89.3 &88.7 &89.9 & 72.2 & 69.3 & 75.2 \\
206
+ AOT~\cite{aot} &NeurIPS2021 &Mask & One Pass & 91.1 & 90.1 & 92.1 & 79.6 & 75.9 & 83.3 \\
207
+ XMem~\cite{xmem} & NeurIPS2022 &Mask & One Pass & 92.0 &90.7 &93.2 & 81.2 & 77.6 & 84.7\\
208
+ \midrule
209
+ % SiamMask~\cite{siammask}& CVPR2019 &Box & One Pass & 69.8 &71.7 &67.8 &56.4 &54.3 &58.5 \\
210
+ SiamMask~\cite{siammask}& CVPR2019 &Box & One Pass & 69.8 &71.7 &67.8 &- &- &- \\
211
+ \midrule
212
+ % MiVOS~\cite{mivos} & CVPR2021 &Scribble &8 Rounds &91.0 &89.6 &92.4 & 84.5 &81.7 &87.4\\
213
+ MiVOS~\cite{mivos} & CVPR2021 &Scribble &8 Rounds &91.0 &89.6 &92.4 &78.6 &74.9 &82.2\\
214
+ % \midrule
215
+ % & ICIP2022 &Click & \\
216
+ \midrule
217
+ TAM (Proposed) &- & Click & One Pass & 88.4 & 87.5 &89.4 & 73.1 & 69.8 & 76.4\\
218
+ % Ours & & 5 Clicks & \\
219
+ \bottomrule
220
+ \end{tabular}
221
+ \end{table}
222
+
223
+
224
+
225
+ \subsection{Implementation}\label{implementation}
226
+
227
+ Inspired by SAM, we consider tracking anything in videos.
228
+ We aim to define this task with high interactivity and ease of use.
229
+ With such a design, high performance can be obtained with very little human interaction effort.
230
+ Figure~\ref{fig:overview} shows the pipeline of our Track Anything Model (TAM).
231
+ As shown, we divide our Track-Anything process into the following four steps:
232
+
233
+ \textbf{Step 1: Initialization with SAM~\cite{sam}.}
234
+ As SAM provides us with the opportunity to segment a region of interest with weak prompts, \textit{e.g.}, points and bounding boxes, we use it to give an initial mask of the target object.
235
+ Following SAM, users can obtain a mask of the object of interest with a single click, or refine it with several more clicks to get a satisfactory initialization.
236
+
237
+ \textbf{Step 2: Tracking with XMem~\cite{xmem}.}
238
+ Given the initialized mask, XMem performs semi-supervised VOS on the following frames.
239
+ Since XMem is an advanced VOS method that can output satisfactory results in simple scenarios, we directly output its predicted masks on most occasions.
240
+ When the mask quality is not good enough, we save the XMem predictions and the corresponding intermediate parameters, \textit{i.e.}, probes and affinities, and skip to Step 3.
241
+ % Given the initialized mask and the whole sequence, XMem performs semi-supervised VOS, which aims to solve the performance decay in long-term prediction with memory potentiation.
242
+
243
+
244
+ \textbf{Step 3: Refinement with SAM~\cite{sam}.}
245
+ We notice that during the inference of VOS models, it is challenging to keep predicting consistent and precise masks.
246
+ In fact, most state-of-the-art VOS models tend to segment more and more coarsely over time during inference.
247
+ Therefore, we utilize SAM to refine the masks predicted by XMem when their quality assessment is not satisfactory.
248
+ Specifically, we project the probes and affinities to be point prompts for SAM, and the predicted mask from Step 2 is used as a mask prompt for SAM.
249
+ Then, with these prompts, SAM is able to produce a refined segmentation mask.
250
+ Such refined masks will also be added to the temporal correspondence of XMem to refine all subsequent object discrimination.
251
+
252
+ \textbf{Step 4: Correction with human participation.}
253
+ % Long video annotation.
254
+ After the above three steps, the TAM can now successfully solve some common challenges and predict segmentation masks.
255
+ However, we notice that it is still difficult to accurately distinguish the objects in some extremely challenging scenarios, especially when processing long videos.
256
+ Therefore, we propose to add human correction during inference, which can bring a qualitative leap in performance with only a very small human effort.
257
+ In detail, users can forcibly pause the TAM process and correct the mask of the current frame with positive and negative clicks.
258
+
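The four steps above can be connected into a short one-pass inference loop. The sketch below is only a simplified illustration of that loop, not the repository's exact implementation: tracker.track follows the usage shown in tracker/base_tracker.py, while sam_segment, mask_quality, and to_point_prompts are hypothetical placeholders standing in for the SAM initialization/refinement and the quality assessment based on probes and affinities.

import numpy as np

def mask_quality(prob):
    """Hypothetical quality proxy: mean confidence inside the predicted region."""
    fg = prob[prob > 0.5]
    return float(fg.mean()) if fg.size else 0.0

def to_point_prompts(prob, k=5):
    """Hypothetical projection of tracker confidences to k positive point prompts."""
    ys, xs = np.unravel_index(np.argsort(prob.ravel())[-k:], prob.shape)
    return np.stack([xs, ys], axis=1), np.ones(k, dtype=np.int64)

def run_tam(frames, first_clicks, sam_segment, tracker, tau=0.8):
    """One-pass TAM loop over a list of (H, W, 3) frames, mirroring Steps 1-4."""
    # Step 1: initial mask from user clicks on the first frame (SAM).
    mask = sam_segment(frames[0], points=first_clicks)
    masks = []
    for ti, frame in enumerate(frames):
        if ti == 0:
            # Step 2: the XMem-style tracker is initialised with the SAM mask.
            mask, prob, _ = tracker.track(frame, mask)
        else:
            mask, prob, _ = tracker.track(frame)
            # Step 3: when the prediction looks unreliable, project the
            # intermediate outputs to prompts and let SAM refine the mask.
            if mask_quality(prob) < tau:
                points, labels = to_point_prompts(prob)
                mask = sam_segment(frame, points=points, labels=labels, mask_prompt=mask)
        # Step 4: a user may pause here and correct `mask` with extra clicks.
        masks.append(mask)
    return masks

In the released code, the quality assessment and the prompt projection are realised through XMem's probes and affinities, as described in Step 3 above.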
259
+ \section{Experiments}
260
+
261
+ \subsection{Quantitative Results}
262
+
263
+
264
+ To evaluate TAM, we utilize the validation set of DAVIS-2016 and the test-development set of DAVIS-2017~\cite{davis}.
265
+ % The evaluation process follows the one we proposed in Section~\ref{implementation}.
266
+ Then, we execute the proposed TAM as demonstrated in Section~\ref{implementation}.
267
+ The results are given in Table~\ref{davis1617}.
268
+ As shown, our TAM obtains $J\&F$ scores of 88.4 and 73.1 on DAVIS-2016-val and DAVIS-2017-test-dev datasets, respectively.
269
+ Note that TAM is initialized by clicks and evaluated in one pass.
270
+ Notably, we find that TAM performs well in difficult and complex scenarios.
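For reference, on the DAVIS benchmarks $J$ is the region similarity (the Jaccard index between the predicted mask $M$ and the ground truth $G$), $F$ is the boundary F-measure, and $J\&F$ reports their mean:
\[
J = \frac{|M \cap G|}{|M \cup G|}, \qquad J\&F = \frac{J + F}{2}.
\]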
271
+ % During the evaluation,
272
+
273
+ % click-based interactive video object segmentation
274
+
275
+ % CLICK-BASED INTERACTIVE VIDEO OBJECT
276
+ % SEGMENTATION
277
+
278
+
279
+ \begin{figure}[t]
280
+ \centering
281
+ \includegraphics[width=\linewidth]{figs/davisresults.pdf}
282
+ \caption{Qualitative results on video sequences from DAVIS-16 and DAVIS-17 datasets~\cite{davis}.}
283
+ \label{fig:davisresult}
284
+ \end{figure}
285
+
286
+
287
+ \begin{figure}[t]
288
+ \centering
289
+ \includegraphics[width=\linewidth]{figs/failedcases.pdf}
290
+ \caption{Failed cases.}
291
+ \label{fig:failedcases}
292
+ \end{figure}
293
+
294
+ \subsection{Qualitative Results}
295
+
296
+ % As we use a new one-pass interactive method to evaluation our TAM, here we only present some qualitative results.
297
+ We also give some qualitative results in Figure~\ref{fig:davisresult}.
298
+ As shown, TAM handles multi-object separation, target deformation, scale change, and camera motion well, which demonstrates its superior tracking and segmentation abilities with only click-based initialization and one-round inference.
299
+
300
+ \subsection{Failed Cases}
301
+ We here also analyze the failed cases, as shown in Figure~\ref{fig:failedcases}.
302
+ Overall, we notice that the failed cases typically appear in the following two situations.
303
+ 1)
304
+ % Separated masks of one object in a long video.
305
+ Current VOS models are mostly designed for short videos and thus focus more on maintaining short-term memory than long-term memory.
306
+ This leads to mask shrinkage or a lack of refinement in long videos, as shown in seq (a).
307
+ Essentially, we aim to address this in Step 3 with the refinement ability of SAM, but its effectiveness is lower than expected in realistic applications.
308
+ This indicates that the refinement ability of SAM based on multiple prompts can be further improved in the future.
309
+ On the other hand, human participation/interaction in TAM can be an approach to solving such difficulties, but too much interaction also results in low efficiency.
310
+ Thus, the mechanism of preserving long-term memory and updating transient memory remains important.
311
+ % Limited refinement by SAM. Although SAM supports to refine previous predictions, via point and mask prompts, . How to .
312
+ 2) When the object structure is complex, \textit{e.g.}, the bicycle wheels in seq (b), whose ground-truth masks contain many cavities, we find it very difficult to obtain a fine-grained initial mask by propagating the clicks.
313
+ Thus, the coarse initial masks may have side effects on the subsequent frames and lead to poor predictions.
314
+ This also suggests that SAM still struggles with complex and fine-grained structures.
315
+
316
+
317
+ \begin{figure}[t]
318
+ \centering
319
+ \includegraphics[width=\linewidth]{figs/avengers_1.pdf}
320
+ \caption{Raw frames, object masks, and inpainted results from the movie \textit{Captain America: Civil War (2016)}.}
321
+ \label{fig:captain}
322
+ \end{figure}
323
+
324
+
325
+
326
+ \section{Applications}
327
+ The proposed Track Anything Model (TAM) provides many possibilities for flexible tracking and segmentation in videos.
328
+ Here, we demonstrate several applications enabled by our proposed method.
329
+ % Our method may be able to a variety of applications.
330
+ In such an interactive way, diverse downstream tasks can be easily achieved.
331
+ % \textbf{Demo.}
332
+ % It is able to solve diverse downstream tasks in such a interactive way.
333
+
334
+ \textbf{Efficient video annotation.}
335
+ TAM has the ability to segment the regions of interest in videos and flexibly choose the objects users want to track. Thus, it can be used for video annotation for tasks like video object tracking and video object segmentation.
336
+ Moreover, click-based interaction makes it easy to use, and the annotation process is highly efficient.
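As an illustration of how such annotations could be exported, here is a minimal sketch that mirrors the per-frame PNG saving pattern used in tracker/base_tracker.py; the directory layout and file naming are assumptions, not a prescribed format.

import os
import numpy as np
from PIL import Image

def save_annotations(masks, out_dir):
    """Save per-frame masks (list of (H, W) integer arrays) as DAVIS-style PNG files."""
    os.makedirs(out_dir, exist_ok=True)
    for ti, mask in enumerate(masks):
        # Each pixel stores the object id (0 = background); optionally attach a palette.
        Image.fromarray(mask.astype(np.uint8)).save(f"{out_dir}/{ti:05d}.png")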
337
+
338
+
339
+ \textbf{Long-term object tracking.}
340
+ The study of long-term tracking is gaining more and more attention because it is much closer to practical applications.
341
+ The current long-term object tracking task requires the tracker to handle target disappearance and reappearance, yet it is still limited to the scope of trimmed videos.
342
+ Our TAM is better suited to real-world applications, as it can handle shot changes in long videos.
343
+
344
+
345
+ \textbf{User-friendly video editing.}
346
+ The Track Anything Model provides us with the flexibility to segment objects in videos.
347
+ With the object segmentation masks provided by TAM, we are then able to remove or alter any of the existing objects in a given video.
348
+ Here we combine TAM with E$^2$FGVI~\cite{e2fgvi} to evaluate its application value.
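A minimal sketch of this editing pipeline is shown below, assuming the object masks have already been produced by TAM. BaseInpainter is the class imported from inpainter/base_inpainter.py in this commit; the method name inpaint, its signature, and the checkpoint path are assumptions made for illustration only.

import numpy as np
from inpainter.base_inpainter import BaseInpainter

def remove_object(frames, masks, e2fgvi_checkpoint, device="cuda:0"):
    """frames: (T, H, W, 3) uint8 video; masks: (T, H, W) with 1 marking the object to remove."""
    inpainter = BaseInpainter(e2fgvi_checkpoint, device)
    # Assumed API: fill the masked regions in every frame with plausible background.
    return inpainter.inpaint(np.asarray(frames), np.asarray(masks))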
349
+
350
+ \textbf{Visualized development toolkit for video tasks.}
351
+ For ease of use, we also provide visualized interfaces for multiple video tasks, \textit{e.g.}, VOS, VOT, video inpainting, and so on.
352
+ With the provided toolkit, users can apply their models on real-world videos and visualize the results instantaneously.
353
+ Corresponding demos are available on Hugging Face\footnote{\url{https://huggingface.co/spaces/watchtowerss/Track-Anything}}.
354
+
355
+
356
+ To show its effectiveness, we conduct a comprehensive test by applying TAM to the movie \textit{Captain America: Civil War (2016)}.
357
+ Some representative results are given in Figure \ref{fig:captain}.
358
+ As shown, TAM can precisely track multiple objects in videos with many shot changes and can further be helpful for video inpainting.
359
+
360
+ % \section{Further work}
361
+
362
+
363
+ % \section*{Acknowledgements}
364
+
365
+ % \appendix
366
+
367
+ % \section{Appendix}
368
+
369
+
370
+ % Optionally include extra information (complete proofs, additional experiments and plots) in the appendix.
371
+ % This section will often be part of the supplemental material.
372
+
373
+
374
+
375
+ \bibliographystyle{plain}
376
+ \bibliography{neurips_2022}
377
+
378
+ \end{document}
requirements.txt CHANGED
@@ -12,5 +12,8 @@ pycocotools
12
  matplotlib
13
  onnxruntime
14
  onnx
 
15
  pyyaml
16
  av
 
 
12
  matplotlib
13
  onnxruntime
14
  onnx
15
+ metaseg==0.6.1
16
  pyyaml
17
  av
18
+ mmcv-full
19
+ mmengine
test_sample/test-sample13.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf112202beb75ecf7d04b27758f1f3eedfc218dac5d5dad0b72a07dd2db0f423
3
+ size 59659465
test_sample/test-sample2.mp4 ADDED
Binary file (473 kB). View file
test_sample/test-sample4.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d739a4b1a0ef3f5b50a9d26b2e767dcc590e6f5463805fc1f659e09d618d4ad
3
+ size 1366182
test_sample/test-sample8.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01d255ef82222d950d2cfae904d82cc20c752577016f0325b21788fb9b458bb9
3
+ size 11979994
track_anything.py CHANGED
@@ -1,20 +1,18 @@
1
- import sys
2
- sys.path.append("/hhd3/gaoshang/Track-Anything/tracker")
3
  import PIL
4
  from tools.interact_tools import SamControler
5
  from tracker.base_tracker import BaseTracker
 
6
  import numpy as np
7
  import argparse
8
 
9
 
10
 
11
  class TrackingAnything():
12
- def __init__(self, sam_checkpoint, xmem_checkpoint, args):
13
  self.args = args
14
  self.samcontroler = SamControler(sam_checkpoint, args.sam_model_type, args.device)
15
  self.xmem = BaseTracker(xmem_checkpoint, device=args.device)
16
-
17
-
18
  # def inference_step(self, first_flag: bool, interact_flag: bool, image: np.ndarray,
19
  # same_image_flag: bool, points:np.ndarray, labels: np.ndarray, logits: np.ndarray=None, multimask=True):
20
  # if first_flag:
@@ -63,7 +61,7 @@ def parse_augment():
63
  parser.add_argument('--sam_model_type', type=str, default="vit_h")
64
  parser.add_argument('--port', type=int, default=6080, help="only useful when running gradio applications")
65
  parser.add_argument('--debug', action="store_true")
66
- parser.add_argument('--mask_save', default=True)
67
  args = parser.parse_args()
68
 
69
  if args.debug:
 
 
1
  import PIL
2
  from tools.interact_tools import SamControler
3
  from tracker.base_tracker import BaseTracker
4
+ from inpainter.base_inpainter import BaseInpainter
5
  import numpy as np
6
  import argparse
7
 
8
 
9
 
10
  class TrackingAnything():
11
+ def __init__(self, sam_checkpoint, xmem_checkpoint, e2fgvi_checkpoint, args):
12
  self.args = args
13
  self.samcontroler = SamControler(sam_checkpoint, args.sam_model_type, args.device)
14
  self.xmem = BaseTracker(xmem_checkpoint, device=args.device)
15
+ self.baseinpainter = BaseInpainter(e2fgvi_checkpoint, args.device)
 
16
  # def inference_step(self, first_flag: bool, interact_flag: bool, image: np.ndarray,
17
  # same_image_flag: bool, points:np.ndarray, labels: np.ndarray, logits: np.ndarray=None, multimask=True):
18
  # if first_flag:
61
  parser.add_argument('--sam_model_type', type=str, default="vit_h")
62
  parser.add_argument('--port', type=int, default=6080, help="only useful when running gradio applications")
63
  parser.add_argument('--debug', action="store_true")
64
+ parser.add_argument('--mask_save', default=False)
65
  args = parser.parse_args()
66
 
67
  if args.debug:
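The updated constructor above now wires SAM, XMem, and the E2FGVI-based inpainter together. A hedged usage sketch follows, assuming the three checkpoints have already been downloaded to local paths; the E2FGVI filename below is a placeholder.

from track_anything import TrackingAnything, parse_augment

args = parse_augment()
model = TrackingAnything(
    sam_checkpoint="./checkpoints/sam_vit_h_4b8939.pth",   # SAM ViT-H weights
    xmem_checkpoint="./checkpoints/XMem-s012.pth",         # XMem weights
    e2fgvi_checkpoint="./checkpoints/E2FGVI-HQ.pth",       # placeholder filename
    args=args,
)
# model.samcontroler: click-based segmentation (SAM)
# model.xmem: mask propagation across frames (XMem)
# model.baseinpainter: video inpainting backend added in this commit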
tracker/.DS_Store CHANGED
Binary files a/tracker/.DS_Store and b/tracker/.DS_Store differ
tracker/base_tracker.py CHANGED
@@ -9,14 +9,14 @@ import yaml
9
  import torch.nn.functional as F
10
  from model.network import XMem
11
  from inference.inference_core import InferenceCore
12
- from util.mask_mapper import MaskMapper
13
  from torchvision import transforms
14
- from util.range_transform import im_normalization
15
- import sys
16
- sys.path.insert(0, sys.path[0]+"/../")
17
  from tools.painter import mask_painter
18
  from tools.base_segmenter import BaseSegmenter
19
  from torchvision.transforms import Resize
 
20
 
21
 
22
  class BaseTracker:
@@ -101,6 +101,8 @@ class BaseTracker:
101
  continue
102
  painted_image = mask_painter(painted_image, (final_mask==obj).astype('uint8'), mask_color=obj+1)
103
 
 
 
104
  return final_mask, final_mask, painted_image
105
 
106
  @torch.no_grad()
@@ -126,50 +128,65 @@ class BaseTracker:
126
  self.mapper.clear_labels()
127
 
128
 
 
 
 
 
 
 
 
 
 
129
  if __name__ == '__main__':
130
- # video frames (multiple objects)
131
  video_path_list = glob.glob(os.path.join('/ssd1/gaomingqi/datasets/davis/JPEGImages/480p/horsejump-high', '*.jpg'))
132
  video_path_list.sort()
133
- # first frame
134
- first_frame_path = '/ssd1/gaomingqi/datasets/davis/Annotations/480p/horsejump-high/00000.png'
135
  # load frames
136
  frames = []
137
  for video_path in video_path_list:
138
  frames.append(np.array(Image.open(video_path).convert('RGB')))
139
- frames = np.stack(frames, 0) # N, H, W, C
140
  # load first frame annotation
 
141
  first_frame_annotation = np.array(Image.open(first_frame_path).convert('P')) # H, W, C
142
 
143
- # ----------------------------------------------------------
144
- # initalise tracker
145
- # ----------------------------------------------------------
146
- device = 'cuda:4'
 
147
  XMEM_checkpoint = '/ssd1/gaomingqi/checkpoints/XMem-s012.pth'
148
- SAM_checkpoint= '/ssd1/gaomingqi/checkpoints/sam_vit_h_4b8939.pth'
149
- model_type = 'vit_h'
150
-
151
- # sam_model = BaseSegmenter(SAM_checkpoint, model_type, device=device)
152
  tracker = BaseTracker(XMEM_checkpoint, device, None, device)
153
-
154
- # # test for storage efficiency
155
- # frames = np.load('/ssd1/gaomingqi/efficiency/efficiency.npy')
156
- # first_frame_annotation = np.array(Image.open('/ssd1/gaomingqi/efficiency/template_mask.png'))
157
-
158
- first_frame_annotation[first_frame_annotation==1] = 15
159
- first_frame_annotation[first_frame_annotation==2] = 20
160
-
161
- save_path = '/ssd1/gaomingqi/results/TrackA/multi-change1'
162
- if not os.path.exists(save_path):
163
- os.mkdir(save_path)
164
-
165
  for ti, frame in enumerate(frames):
166
  if ti == 0:
167
- mask, prob, painted_image = tracker.track(frame, first_frame_annotation)
 
168
  else:
169
- mask, prob, painted_image = tracker.track(frame)
170
- # save
171
- painted_image = Image.fromarray(painted_image)
172
- painted_image.save(f'{save_path}/{ti:05d}.png')
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
  # tracker.clear_memory()
175
  # for ti, frame in enumerate(frames):
@@ -241,6 +258,3 @@ if __name__ == '__main__':
241
  # prob = Image.fromarray((probs[1].cpu().numpy()*255).astype('uint8'))
242
 
243
  # # prob.save(f'/ssd1/gaomingqi/failure/probs/{ti:05d}.png')
244
-
245
-
246
-
9
  import torch.nn.functional as F
10
  from model.network import XMem
11
  from inference.inference_core import InferenceCore
12
+ from tracker.util.mask_mapper import MaskMapper
13
  from torchvision import transforms
14
+ from tracker.util.range_transform import im_normalization
15
+
 
16
  from tools.painter import mask_painter
17
  from tools.base_segmenter import BaseSegmenter
18
  from torchvision.transforms import Resize
19
+ import progressbar
20
 
21
 
22
  class BaseTracker:
101
  continue
102
  painted_image = mask_painter(painted_image, (final_mask==obj).astype('uint8'), mask_color=obj+1)
103
 
104
+ # print(f'max memory allocated: {torch.cuda.max_memory_allocated()/(2**20)} MB')
105
+
106
  return final_mask, final_mask, painted_image
107
 
108
  @torch.no_grad()
128
  self.mapper.clear_labels()
129
 
130
 
131
+ ## how to use:
132
+ ## 1/3) prepare device and xmem_checkpoint
133
+ # device = 'cuda:2'
134
+ # XMEM_checkpoint = '/ssd1/gaomingqi/checkpoints/XMem-s012.pth'
135
+ ## 2/3) initialise Base Tracker
136
+ # tracker = BaseTracker(XMEM_checkpoint, device, None, device) # leave an interface for sam model (currently set None)
137
+ ## 3/3) track: call tracker.track(frame, first_frame_annotation) on the first frame, then tracker.track(frame) on the following frames
138
+
139
+
140
  if __name__ == '__main__':
141
+ # video frames (take videos from DAVIS-2017 as examples)
142
  video_path_list = glob.glob(os.path.join('/ssd1/gaomingqi/datasets/davis/JPEGImages/480p/horsejump-high', '*.jpg'))
143
  video_path_list.sort()
 
 
144
  # load frames
145
  frames = []
146
  for video_path in video_path_list:
147
  frames.append(np.array(Image.open(video_path).convert('RGB')))
148
+ frames = np.stack(frames, 0) # T, H, W, C
149
  # load first frame annotation
150
+ first_frame_path = '/ssd1/gaomingqi/datasets/davis/Annotations/480p/horsejump-high/00000.png'
151
  first_frame_annotation = np.array(Image.open(first_frame_path).convert('P')) # H, W, C
152
 
153
+ # ------------------------------------------------------------------------------------
154
+ # how to use
155
+ # ------------------------------------------------------------------------------------
156
+ # 1/4: set checkpoint and device
157
+ device = 'cuda:2'
158
  XMEM_checkpoint = '/ssd1/gaomingqi/checkpoints/XMem-s012.pth'
159
+ # SAM_checkpoint= '/ssd1/gaomingqi/checkpoints/sam_vit_h_4b8939.pth'
160
+ # model_type = 'vit_h'
161
+ # ------------------------------------------------------------------------------------
162
+ # 2/4: initialise the tracker
163
  tracker = BaseTracker(XMEM_checkpoint, device, None, device)
164
+ # ------------------------------------------------------------------------------------
165
+ # 3/4: for each frame, get tracking results by tracker.track(frame, first_frame_annotation)
166
+ # frame: numpy array (H, W, C); first_frame_annotation: numpy array (H, W), only needed for the first frame and omitted afterwards
167
+ painted_frames = []
 
 
 
 
 
 
 
 
168
  for ti, frame in enumerate(frames):
169
  if ti == 0:
170
+ mask, prob, painted_frame = tracker.track(frame, first_frame_annotation)
171
+ # mask: predicted mask (H, W); painted_frame: the frame with the mask painted on it
172
  else:
173
+ mask, prob, painted_frame = tracker.track(frame)
174
+ painted_frames.append(painted_frame)
175
+ # ----------------------------------------------
176
+ # 4/4: clear memory in XMEM for the next video
177
+ tracker.clear_memory()
178
+ # ----------------------------------------------
179
+ # end
180
+ # ----------------------------------------------
181
+ print(f'max memory allocated: {torch.cuda.max_memory_allocated()/(2**20)} MB')
182
+ # set saving path
183
+ save_path = '/ssd1/gaomingqi/results/TAM/blackswan'
184
+ if not os.path.exists(save_path):
185
+ os.mkdir(save_path)
186
+ # save
187
+ for ti, painted_frame in enumerate(progressbar.progressbar(painted_frames)):  # enumerate so each saved frame gets its own index
188
+ painted_frame = Image.fromarray(painted_frame)
189
+ painted_frame.save(f'{save_path}/{ti:05d}.png')
190
 
191
  # tracker.clear_memory()
192
  # for ti, frame in enumerate(frames):
258
  # prob = Image.fromarray((probs[1].cpu().numpy()*255).astype('uint8'))
259
 
260
  # # prob.save(f'/ssd1/gaomingqi/failure/probs/{ti:05d}.png')
 
 
 
tracker/inference/inference_core.py CHANGED
@@ -2,7 +2,7 @@ from inference.memory_manager import MemoryManager
2
  from model.network import XMem
3
  from model.aggregate import aggregate
4
 
5
- from util.tensor_util import pad_divide_by, unpad
6
 
7
 
8
  class InferenceCore:
2
  from model.network import XMem
3
  from model.aggregate import aggregate
4
 
5
+ from tracker.util.tensor_util import pad_divide_by, unpad
6
 
7
 
8
  class InferenceCore: