watchtowerss committed on
Commit
bb879e5
1 Parent(s): 54438f1

add inpaint and example

.gitattributes CHANGED
@@ -36,3 +36,10 @@ assets/demo_version_1.MP4 filter=lfs diff=lfs merge=lfs -text
  assets/inpainting.gif filter=lfs diff=lfs merge=lfs -text
  assets/qingming.mp4 filter=lfs diff=lfs merge=lfs -text
  test_sample/test-sample1.mp4 filter=lfs diff=lfs merge=lfs -text
+ assets/avengers.gif filter=lfs diff=lfs merge=lfs -text
+ overleaf/Track[[:space:]]Anything/figs/avengers_1.pdf filter=lfs diff=lfs merge=lfs -text
+ overleaf/Track[[:space:]]Anything/figs/davisresults.pdf filter=lfs diff=lfs merge=lfs -text
+ overleaf/Track[[:space:]]Anything/figs/failedcases.pdf filter=lfs diff=lfs merge=lfs -text
+ test_sample/test-sample13.mp4 filter=lfs diff=lfs merge=lfs -text
+ test_sample/test-sample4.mp4 filter=lfs diff=lfs merge=lfs -text
+ test_sample/test-sample8.mp4 filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,8 +1,7 @@
1
  import gradio as gr
2
  import argparse
 
3
  import cv2
4
- import time
5
- from PIL import Image
6
  import numpy as np
7
  import os
8
  import sys
@@ -14,9 +13,8 @@ import requests
14
  import json
15
  import torchvision
16
  import torch
17
- import concurrent.futures
18
- import queue
19
- from tools.painter import mask_painter, point_painter
20
  # download checkpoints
21
  def download_checkpoint(url, folder, filename):
22
  os.makedirs(folder, exist_ok=True)
@@ -34,6 +32,19 @@ def download_checkpoint(url, folder, filename):
34
 
35
  return filepath
36
 
37
  # convert points input to prompt state
38
  def get_prompt(click_state, click_input):
39
  inputs = json.loads(click_input)
@@ -74,18 +85,18 @@ def get_frames_from_video(video_input, video_state):
74
  break
75
  except (OSError, TypeError, ValueError, KeyError, SyntaxError) as e:
76
  print("read_frame_source:{} error. {}\n".format(video_path, str(e)))
77
-
78
  # initialize video_state
79
  video_state = {
80
  "video_name": os.path.split(video_path)[-1],
81
  "origin_images": frames,
82
  "painted_images": frames.copy(),
83
- "masks": [None]*len(frames),
84
  "logits": [None]*len(frames),
85
  "select_frame_number": 0,
86
  "fps": fps
87
  }
88
- video_info = "Video Name: {}, FPS: {}, Total Frames: {}".format(video_state["video_name"], video_state["fps"], len(frames))
89
 
90
  model.samcontroler.sam_controler.reset_image()
91
  model.samcontroler.sam_controler.set_image(video_state["origin_images"][0])
@@ -94,8 +105,10 @@ def get_frames_from_video(video_input, video_state):
94
  gr.update(visible=True), gr.update(visible=True), \
95
  gr.update(visible=True), gr.update(visible=True), \
96
  gr.update(visible=True), gr.update(visible=True), \
97
- gr.update(visible=True)
98
 
 
 
99
  # get the select frame from gradio slider
100
  def select_template(image_selection_slider, video_state, interactive_state):
101
 
@@ -108,13 +121,22 @@ def select_template(image_selection_slider, video_state, interactive_state):
108
  model.samcontroler.sam_controler.reset_image()
109
  model.samcontroler.sam_controler.set_image(video_state["origin_images"][image_selection_slider])
110
 
111
- # # clear multi mask
112
- # interactive_state["multi_mask"] = {"masks":[], "mask_names":[]}
 
 
113
 
114
  return video_state["painted_images"][image_selection_slider], video_state, interactive_state
115
 
116
- def get_end_number(track_pause_number_slider, interactive_state):
 
117
  interactive_state["track_end_number"] = track_pause_number_slider
118
  return interactive_state
119
 
120
  # use sam to get the mask
@@ -207,7 +229,7 @@ def vos_tracking_video(video_state, interactive_state, mask_dropdown):
207
  video_state["logits"][video_state["select_frame_number"]:] = logits
208
  video_state["painted_images"][video_state["select_frame_number"]:] = painted_images
209
 
210
- video_output = generate_video_from_frames(video_state["painted_images"], output_path="./result/{}".format(video_state["video_name"]), fps=fps) # import video_input to name the output video
211
  interactive_state["inference_times"] += 1
212
 
213
  print("For generating this tracking result, inference times: {}, click times: {}, positive: {}, negative: {}".format(interactive_state["inference_times"],
@@ -228,6 +250,36 @@ def vos_tracking_video(video_state, interactive_state, mask_dropdown):
228
  #### shanggao code for mask save
229
  return video_output, video_state, interactive_state
230
 
231
  # generate video after vos inference
232
  def generate_video_from_frames(frames, output_path, fps=30):
233
  """
@@ -257,17 +309,21 @@ SAM_checkpoint = "sam_vit_h_4b8939.pth"
257
  sam_checkpoint_url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
258
  xmem_checkpoint = "XMem-s012.pth"
259
  xmem_checkpoint_url = "https://github.com/hkchengrex/XMem/releases/download/v1.0/XMem-s012.pth"
 
 
 
260
  folder ="./checkpoints"
261
  SAM_checkpoint = download_checkpoint(sam_checkpoint_url, folder, SAM_checkpoint)
262
  xmem_checkpoint = download_checkpoint(xmem_checkpoint_url, folder, xmem_checkpoint)
263
-
264
  # args, defined in track_anything.py
265
  args = parse_augment()
266
  # args.port = 12315
267
- # args.device = "cuda:1"
268
  # args.mask_save = True
269
 
270
- model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, args)
 
271
 
272
  with gr.Blocks() as iface:
273
  """
@@ -283,7 +339,8 @@ with gr.Blocks() as iface:
283
  "mask_names": [],
284
  "masks": []
285
  },
286
- "track_end_number": None
 
287
  }
288
  )
289
 
@@ -293,6 +350,7 @@ with gr.Blocks() as iface:
293
  "origin_images": None,
294
  "painted_images": None,
295
  "masks": None,
 
296
  "logits": None,
297
  "select_frame_number": 0,
298
  "fps": 30
@@ -305,8 +363,11 @@ with gr.Blocks() as iface:
305
  with gr.Column():
306
  with gr.Row(scale=0.4):
307
  video_input = gr.Video(autosize=True)
308
- video_info = gr.Textbox()
309
-
 
 
 
310
 
311
 
312
  with gr.Row():
@@ -342,7 +403,9 @@ with gr.Blocks() as iface:
342
  mask_dropdown = gr.Dropdown(multiselect=True, value=[], label="Mask_select", info=".", visible=False)
343
  remove_mask_button = gr.Button(value="Remove mask", interactive=True, visible=False)
344
  video_output = gr.Video(autosize=True, visible=False).style(height=360)
345
- tracking_video_predict_button = gr.Button(value="Tracking", visible=False)
 
 
346
 
347
  # first step: get the video information
348
  extract_frames_button.click(
@@ -352,7 +415,7 @@ with gr.Blocks() as iface:
352
  ],
353
  outputs=[video_state, video_info, template_frame,
354
  image_selection_slider, track_pause_number_slider,point_prompt, click_mode, clear_button_click, Add_mask_button, template_frame,
355
- tracking_video_predict_button, video_output, mask_dropdown, remove_mask_button]
356
  )
357
 
358
  # second step: select images from slider
@@ -360,8 +423,11 @@ with gr.Blocks() as iface:
360
  inputs=[image_selection_slider, video_state, interactive_state],
361
  outputs=[template_frame, video_state, interactive_state], api_name="select_image")
362
  track_pause_number_slider.release(fn=get_end_number,
363
- inputs=[track_pause_number_slider, interactive_state],
364
- outputs=[interactive_state], api_name="end_image")
 
 
 
365
 
366
  # click select image to get mask using sam
367
  template_frame.select(
@@ -390,6 +456,13 @@ with gr.Blocks() as iface:
390
  outputs=[video_output, video_state, interactive_state]
391
  )
392
 
393
  # click to get mask
394
  mask_dropdown.change(
395
  fn=show_mask,
@@ -404,6 +477,7 @@ with gr.Blocks() as iface:
404
  "origin_images": None,
405
  "painted_images": None,
406
  "masks": None,
 
407
  "logits": None,
408
  "select_frame_number": 0,
409
  "fps": 30
@@ -417,14 +491,15 @@ with gr.Blocks() as iface:
417
  "mask_names": [],
418
  "masks": []
419
  },
420
- "track_end_number": 0
 
421
  },
422
  [[],[]],
423
  None,
424
  None,
425
  gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
426
  gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
427
- gr.update(visible=False), gr.update(visible=False), gr.update(visible=False, value=[]), gr.update(visible=False) \
428
 
429
  ),
430
  [],
@@ -435,7 +510,7 @@ with gr.Blocks() as iface:
435
  video_output,
436
  template_frame,
437
  tracking_video_predict_button, image_selection_slider , track_pause_number_slider,point_prompt, click_mode, clear_button_click,
438
- Add_mask_button, template_frame, tracking_video_predict_button, video_output, mask_dropdown, remove_mask_button
439
  ],
440
  queue=False,
441
  show_progress=False)
@@ -445,10 +520,21 @@ with gr.Blocks() as iface:
445
  fn = clear_click,
446
  inputs = [video_state, click_state,],
447
  outputs = [template_frame,click_state],
448
-
449
  )
450
  iface.queue(concurrency_count=1)
451
- iface.launch(enable_queue=True)
452
 
453
 
454
 
1
  import gradio as gr
2
  import argparse
3
+ import gdown
4
  import cv2
 
 
5
  import numpy as np
6
  import os
7
  import sys
13
  import json
14
  import torchvision
15
  import torch
16
+ from tools.painter import mask_painter
17
+
 
18
  # download checkpoints
19
  def download_checkpoint(url, folder, filename):
20
  os.makedirs(folder, exist_ok=True)
32
 
33
  return filepath
34
 
35
+ def download_checkpoint_from_google_drive(file_id, folder, filename):
36
+ os.makedirs(folder, exist_ok=True)
37
+ filepath = os.path.join(folder, filename)
38
+
39
+ if not os.path.exists(filepath):
40
+ print("Downloading checkpoints from Google Drive... Tip: if you cannot see the progress bar, please download the file manually \
41
+ and put it in the checkpoints directory. E2FGVI-HQ-CVPR22.pth: https://github.com/MCG-NKU/E2FGVI (E2FGVI-HQ model)")
42
+ url = f"https://drive.google.com/uc?id={file_id}"
43
+ gdown.download(url, filepath, quiet=False)
44
+ print("Downloaded successfully!")
45
+
46
+ return filepath
47
+
48
  # convert points input to prompt state
49
  def get_prompt(click_state, click_input):
50
  inputs = json.loads(click_input)
85
  break
86
  except (OSError, TypeError, ValueError, KeyError, SyntaxError) as e:
87
  print("read_frame_source:{} error. {}\n".format(video_path, str(e)))
88
+ image_size = (frames[0].shape[0],frames[0].shape[1])
89
  # initialize video_state
90
  video_state = {
91
  "video_name": os.path.split(video_path)[-1],
92
  "origin_images": frames,
93
  "painted_images": frames.copy(),
94
+ "masks": [np.zeros((frames[0].shape[0],frames[0].shape[1]), np.uint8)]*len(frames),
95
  "logits": [None]*len(frames),
96
  "select_frame_number": 0,
97
  "fps": fps
98
  }
99
+ video_info = "Video Name: {}, FPS: {}, Total Frames: {}, Image Size:{}".format(video_state["video_name"], video_state["fps"], len(frames), image_size)
100
 
101
  model.samcontroler.sam_controler.reset_image()
102
  model.samcontroler.sam_controler.set_image(video_state["origin_images"][0])
105
  gr.update(visible=True), gr.update(visible=True), \
106
  gr.update(visible=True), gr.update(visible=True), \
107
  gr.update(visible=True), gr.update(visible=True), \
108
+ gr.update(visible=True), gr.update(visible=True)
109
 
110
+ def run_example(example):
111
+ return video_input
112
  # get the select frame from gradio slider
113
  def select_template(image_selection_slider, video_state, interactive_state):
114
 
121
  model.samcontroler.sam_controler.reset_image()
122
  model.samcontroler.sam_controler.set_image(video_state["origin_images"][image_selection_slider])
123
 
124
+ # update the masks when selecting a new template frame
125
+ # if video_state["masks"][image_selection_slider] is not None:
126
+ # video_state["painted_images"][image_selection_slider] = mask_painter(video_state["origin_images"][image_selection_slider], video_state["masks"][image_selection_slider])
127
+
128
 
129
  return video_state["painted_images"][image_selection_slider], video_state, interactive_state
130
 
131
+ # set the tracking end frame
132
+ def get_end_number(track_pause_number_slider, video_state, interactive_state):
133
  interactive_state["track_end_number"] = track_pause_number_slider
134
+
135
+ return video_state["painted_images"][track_pause_number_slider],interactive_state
136
+
137
+ def get_resize_ratio(resize_ratio_slider, interactive_state):
138
+ interactive_state["resize_ratio"] = resize_ratio_slider
139
+
140
  return interactive_state
141
 
142
  # use sam to get the mask
229
  video_state["logits"][video_state["select_frame_number"]:] = logits
230
  video_state["painted_images"][video_state["select_frame_number"]:] = painted_images
231
 
232
+ video_output = generate_video_from_frames(video_state["painted_images"], output_path="./result/track/{}".format(video_state["video_name"]), fps=fps) # import video_input to name the output video
233
  interactive_state["inference_times"] += 1
234
 
235
  print("For generating this tracking result, inference times: {}, click times: {}, positive: {}, negative: {}".format(interactive_state["inference_times"],
250
  #### shanggao code for mask save
251
  return video_output, video_state, interactive_state
252
 
253
+ # extracting masks from mask_dropdown
254
+ # def extract_sole_mask(video_state, mask_dropdown):
255
+ # combined_masks =
256
+ # unique_masks = np.unique(combined_masks)
257
+ # return 0
258
+
259
+ # inpaint
260
+ def inpaint_video(video_state, interactive_state, mask_dropdown):
261
+ frames = np.asarray(video_state["origin_images"])
262
+ fps = video_state["fps"]
263
+ inpaint_masks = np.asarray(video_state["masks"])
264
+ if len(mask_dropdown) == 0:
265
+ mask_dropdown = ["mask_001"]
266
+ mask_dropdown.sort()
267
+ # convert mask_dropdown to mask numbers
268
+ inpaint_mask_numbers = [int(mask_dropdown[i].split("_")[1]) for i in range(len(mask_dropdown))]
269
+ # iterate through all masks and remove the masks that are not in mask_dropdown
270
+ unique_masks = np.unique(inpaint_masks)
271
+ num_masks = len(unique_masks) - 1
272
+ for i in range(1, num_masks + 1):
273
+ if i in inpaint_mask_numbers:
274
+ continue
275
+ inpaint_masks[inpaint_masks==i] = 0
276
+ # inpaint for videos
277
+ inpainted_frames = model.baseinpainter.inpaint(frames, inpaint_masks, ratio=interactive_state["resize_ratio"]) # numpy array, T, H, W, 3
278
+ video_output = generate_video_from_frames(inpainted_frames, output_path="./result/inpaint/{}".format(video_state["video_name"]), fps=fps) # import video_input to name the output video
279
+
280
+ return video_output
281
+
282
+
283
  # generate video after vos inference
284
  def generate_video_from_frames(frames, output_path, fps=30):
285
  """
309
  sam_checkpoint_url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
310
  xmem_checkpoint = "XMem-s012.pth"
311
  xmem_checkpoint_url = "https://github.com/hkchengrex/XMem/releases/download/v1.0/XMem-s012.pth"
312
+ e2fgvi_checkpoint = "E2FGVI-HQ-CVPR22.pth"
313
+ e2fgvi_checkpoint_id = "10wGdKSUOie0XmCr8SQ2A2FeDe-mfn5w3"
314
+
315
  folder ="./checkpoints"
316
  SAM_checkpoint = download_checkpoint(sam_checkpoint_url, folder, SAM_checkpoint)
317
  xmem_checkpoint = download_checkpoint(xmem_checkpoint_url, folder, xmem_checkpoint)
318
+ e2fgvi_checkpoint = download_checkpoint_from_google_drive(e2fgvi_checkpoint_id, folder, e2fgvi_checkpoint)
319
  # args, defined in track_anything.py
320
  args = parse_augment()
321
  # args.port = 12315
322
+ # args.device = "cuda:2"
323
  # args.mask_save = True
324
 
325
+ # initialize sam, xmem, e2fgvi models
326
+ model = TrackingAnything(SAM_checkpoint, xmem_checkpoint, e2fgvi_checkpoint,args)
327
 
328
  with gr.Blocks() as iface:
329
  """
339
  "mask_names": [],
340
  "masks": []
341
  },
342
+ "track_end_number": None,
343
+ "resize_ratio": 1
344
  }
345
  )
346
 
350
  "origin_images": None,
351
  "painted_images": None,
352
  "masks": None,
353
+ "inpaint_masks": None,
354
  "logits": None,
355
  "select_frame_number": 0,
356
  "fps": 30
363
  with gr.Column():
364
  with gr.Row(scale=0.4):
365
  video_input = gr.Video(autosize=True)
366
+ with gr.Column():
367
+ video_info = gr.Textbox()
368
+ video_info = gr.Textbox(value="If you want to use the inpaint function, it is best to run this demo locally on a machine with more VRAM. \
369
+ Alternatively, you can use the resize ratio slider to scale the original frames down to around 360p resolution for faster processing.")
370
+ resize_ratio_slider = gr.Slider(minimum=0.02, maximum=1, step=0.02, value=1, label="Resize ratio", visible=True)
371
 
372
 
373
  with gr.Row():
403
  mask_dropdown = gr.Dropdown(multiselect=True, value=[], label="Mask_select", info=".", visible=False)
404
  remove_mask_button = gr.Button(value="Remove mask", interactive=True, visible=False)
405
  video_output = gr.Video(autosize=True, visible=False).style(height=360)
406
+ with gr.Row():
407
+ tracking_video_predict_button = gr.Button(value="Tracking", visible=False)
408
+ inpaint_video_predict_button = gr.Button(value="Inpaint", visible=False)
409
 
410
  # first step: get the video information
411
  extract_frames_button.click(
415
  ],
416
  outputs=[video_state, video_info, template_frame,
417
  image_selection_slider, track_pause_number_slider,point_prompt, click_mode, clear_button_click, Add_mask_button, template_frame,
418
+ tracking_video_predict_button, video_output, mask_dropdown, remove_mask_button, inpaint_video_predict_button]
419
  )
420
 
421
  # second step: select images from slider
423
  inputs=[image_selection_slider, video_state, interactive_state],
424
  outputs=[template_frame, video_state, interactive_state], api_name="select_image")
425
  track_pause_number_slider.release(fn=get_end_number,
426
+ inputs=[track_pause_number_slider, video_state, interactive_state],
427
+ outputs=[template_frame, interactive_state], api_name="end_image")
428
+ resize_ratio_slider.release(fn=get_resize_ratio,
429
+ inputs=[resize_ratio_slider, interactive_state],
430
+ outputs=[interactive_state], api_name="resize_ratio")
431
 
432
  # click select image to get mask using sam
433
  template_frame.select(
456
  outputs=[video_output, video_state, interactive_state]
457
  )
458
 
459
+ # inpaint video from select image and mask
460
+ inpaint_video_predict_button.click(
461
+ fn=inpaint_video,
462
+ inputs=[video_state, interactive_state, mask_dropdown],
463
+ outputs=[video_output]
464
+ )
465
+
466
  # click to get mask
467
  mask_dropdown.change(
468
  fn=show_mask,
477
  "origin_images": None,
478
  "painted_images": None,
479
  "masks": None,
480
+ "inpaint_masks": None,
481
  "logits": None,
482
  "select_frame_number": 0,
483
  "fps": 30
491
  "mask_names": [],
492
  "masks": []
493
  },
494
+ "track_end_number": 0,
495
+ "resize_ratio": 1
496
  },
497
  [[],[]],
498
  None,
499
  None,
500
  gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
501
  gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
502
+ gr.update(visible=False), gr.update(visible=False), gr.update(visible=False, value=[]), gr.update(visible=False), gr.update(visible=False) \
503
 
504
  ),
505
  [],
510
  video_output,
511
  template_frame,
512
  tracking_video_predict_button, image_selection_slider , track_pause_number_slider,point_prompt, click_mode, clear_button_click,
513
+ Add_mask_button, template_frame, tracking_video_predict_button, video_output, mask_dropdown, remove_mask_button,inpaint_video_predict_button
514
  ],
515
  queue=False,
516
  show_progress=False)
520
  fn = clear_click,
521
  inputs = [video_state, click_state,],
522
  outputs = [template_frame,click_state],
523
+ )
524
+ # set example
525
+ gr.Markdown("## Examples")
526
+ gr.Examples(
527
+ examples=[os.path.join(os.path.dirname(__file__), "./test_sample/", test_sample) for test_sample in ["test-sample8.mp4","test-sample4.mp4", \
528
+ "test-sample2.mp4","test-sample13.mp4"]],
529
+ fn=run_example,
530
+ inputs=[
531
+ video_input
532
+ ],
533
+ outputs=[video_input],
534
+ # cache_examples=True,
535
  )
536
  iface.queue(concurrency_count=1)
537
+ iface.launch(debug=True, enable_queue=True, server_port=args.port, server_name="0.0.0.0")
538
 
539
 
540
 
assets/avengers.gif ADDED

Git LFS Details

  • SHA256: 9193a028c2e968ff7a7ee222ccc27166a5fbbe40a4d971cee13eba519134c5cf
  • Pointer size: 133 Bytes
  • Size of remote file: 99.2 MB
assets/track-anything-logo.jpg ADDED
checkpoints/E2FGVI-HQ-CVPR22.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afff989d41205598a79ce24630b9c83af4b0a06f45b137979a25937d94c121a5
3
+ size 164535938
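The pointer above records the expected SHA-256 of the E2FGVI checkpoint. If the Google Drive download is interrupted, or the file is fetched manually as the new print message in app.py suggests, one way to validate the local copy is to hash it and compare against that oid. A minimal sketch, assuming the file sits in the ./checkpoints folder used by app.py:

    import hashlib

    EXPECTED = "afff989d41205598a79ce24630b9c83af4b0a06f45b137979a25937d94c121a5"

    def sha256_of(path, chunk_size=1 << 20):
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                h.update(chunk)  # hash the file in 1 MiB chunks
        return h.hexdigest()

    print(sha256_of("./checkpoints/E2FGVI-HQ-CVPR22.pth") == EXPECTED)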
inpainter/.DS_Store CHANGED
Binary files a/inpainter/.DS_Store and b/inpainter/.DS_Store differ
inpainter/base_inpainter.py CHANGED
@@ -7,7 +7,7 @@ import yaml
7
  import cv2
8
  import importlib
9
  import numpy as np
10
- from util.tensor_util import resize_frames, resize_masks
11
 
12
 
13
  class BaseInpainter:
@@ -15,7 +15,7 @@ class BaseInpainter:
15
  """
16
  E2FGVI_checkpoint: checkpoint of inpainter (version hq, with multi-resolution support)
17
  """
18
- net = importlib.import_module('model.e2fgvi_hq')
19
  self.model = net.InpaintGenerator().to(device)
20
  self.model.load_state_dict(torch.load(E2FGVI_checkpoint, map_location=device))
21
  self.model.eval()
@@ -67,6 +67,10 @@ class BaseInpainter:
67
  size = None
68
  else:
69
  size = (int(W*ratio), int(H*ratio))
70
 
71
  masks = np.expand_dims(masks, axis=3) # expand to T, H, W, 1
72
  binary_masks = resize_masks(masks, size)
7
  import cv2
8
  import importlib
9
  import numpy as np
10
+ from inpainter.util.tensor_util import resize_frames, resize_masks
11
 
12
 
13
  class BaseInpainter:
15
  """
16
  E2FGVI_checkpoint: checkpoint of inpainter (version hq, with multi-resolution support)
17
  """
18
+ net = importlib.import_module('inpainter.model.e2fgvi_hq')
19
  self.model = net.InpaintGenerator().to(device)
20
  self.model.load_state_dict(torch.load(E2FGVI_checkpoint, map_location=device))
21
  self.model.eval()
67
  size = None
68
  else:
69
  size = (int(W*ratio), int(H*ratio))
70
+ if size[0] % 2 > 0:
71
+ size[0] += 1
72
+ if size[1] % 2 > 0:
73
+ size[1] += 1
74
 
75
  masks = np.expand_dims(masks, axis=3) # expand to T, H, W, 1
76
  binary_masks = resize_masks(masks, size)
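One caution about the even-size adjustment added above: size is built as a tuple a few lines earlier (size = (int(W*ratio), int(H*ratio))), so the in-place size[0] += 1 and size[1] += 1 will raise a TypeError whenever the scaled width or height is odd. A small helper that performs the same rounding without mutating a tuple could look like the sketch below; the function name is illustrative, not part of the commit.

    def even_size(width, height, ratio):
        # scale by ratio and round each side up to the next even number,
        # which is what the lines added in this commit intend
        w, h = int(width * ratio), int(height * ratio)
        if w % 2:
            w += 1
        if h % 2:
            h += 1
        return (w, h)

    # e.g. size = None if ratio == 1 else even_size(W, H, ratio)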
inpainter/model/e2fgvi_hq.py CHANGED
@@ -5,10 +5,10 @@ import torch
5
  import torch.nn as nn
6
  import torch.nn.functional as F
7
 
8
- from model.modules.flow_comp import SPyNet
9
- from model.modules.feat_prop import BidirectionalPropagation, SecondOrderDeformableAlignment
10
- from model.modules.tfocal_transformer_hq import TemporalFocalTransformerBlock, SoftSplit, SoftComp
11
- from model.modules.spectral_norm import spectral_norm as _spectral_norm
12
 
13
 
14
  class BaseNetwork(nn.Module):
5
  import torch.nn as nn
6
  import torch.nn.functional as F
7
 
8
+ from inpainter.model.modules.flow_comp import SPyNet
9
+ from inpainter.model.modules.feat_prop import BidirectionalPropagation, SecondOrderDeformableAlignment
10
+ from inpainter.model.modules.tfocal_transformer_hq import TemporalFocalTransformerBlock, SoftSplit, SoftComp
11
+ from inpainter.model.modules.spectral_norm import spectral_norm as _spectral_norm
12
 
13
 
14
  class BaseNetwork(nn.Module):
inpainter/model/modules/feat_prop.py CHANGED
@@ -7,7 +7,7 @@ import torch.nn as nn
7
  from mmcv.ops import ModulatedDeformConv2d, modulated_deform_conv2d
8
  from mmengine.model import constant_init
9
 
10
- from model.modules.flow_comp import flow_warp
11
 
12
 
13
  class SecondOrderDeformableAlignment(ModulatedDeformConv2d):
7
  from mmcv.ops import ModulatedDeformConv2d, modulated_deform_conv2d
8
  from mmengine.model import constant_init
9
 
10
+ from inpainter.model.modules.flow_comp import flow_warp
11
 
12
 
13
  class SecondOrderDeformableAlignment(ModulatedDeformConv2d):
overleaf/.DS_Store ADDED
Binary file (6.15 kB).
overleaf/Track Anything.zip ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d271378ac9538e322b362b43a41e2c22a21cffac6f539a0c3e5b140c3b24b47e
3
+ size 5370701
overleaf/Track Anything/figs/avengers_1.pdf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a519eb00a2d315ecdc36b5a53e174e9b3361a9526c7fcd8a96bfefde2eeb940f
3
+ size 2570569
overleaf/Track Anything/figs/davisresults.pdf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fefd74df3daafd48ffb72a725c43354712a244db70e6c5d7ae8773203e0be492
3
+ size 1349133
overleaf/Track Anything/figs/failedcases.pdf ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccb662ff62914d05fe8dc99640b9f89b32847675dd2069900a27771569378aa4
3
+ size 1200242
overleaf/Track Anything/figs/overview_4.pdf ADDED
Binary file (424 kB).
overleaf/Track Anything/neurips_2022.bbl ADDED
@@ -0,0 +1,105 @@
1
+ \begin{thebibliography}{10}
2
+
3
+ \bibitem{xmem}
4
+ Ho~Kei Cheng and Alexander~G. Schwing.
5
+ \newblock Xmem: Long-term video object segmentation with an atkinson-shiffrin
6
+ memory model.
7
+ \newblock In {\em {ECCV} {(28)}}, volume 13688 of {\em Lecture Notes in
8
+ Computer Science}, pages 640--658. Springer, 2022.
9
+
10
+ \bibitem{mivos}
11
+ Ho~Kei Cheng, Yu{-}Wing Tai, and Chi{-}Keung Tang.
12
+ \newblock Modular interactive video object segmentation: Interaction-to-mask,
13
+ propagation and difference-aware fusion.
14
+ \newblock In {\em {CVPR}}, pages 5559--5568. Computer Vision Foundation /
15
+ {IEEE}, 2021.
16
+
17
+ \bibitem{vit}
18
+ Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn,
19
+ Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg
20
+ Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby.
21
+ \newblock An image is worth 16x16 words: Transformers for image recognition at
22
+ scale.
23
+ \newblock In {\em {ICLR}}. OpenReview.net, 2021.
24
+
25
+ \bibitem{vos}
26
+ Mingqi Gao, Feng Zheng, James J.~Q. Yu, Caifeng Shan, Guiguang Ding, and
27
+ Jungong Han.
28
+ \newblock Deep learning for video object segmentation: a review.
29
+ \newblock {\em Artif. Intell. Rev.}, 56(1):457--531, 2023.
30
+
31
+ \bibitem{sam}
32
+ Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura
33
+ Gustafson, Tete Xiao, Spencer Whitehead, Alexander~C Berg, Wan-Yen Lo, et~al.
34
+ \newblock Segment anything.
35
+ \newblock {\em arXiv preprint arXiv:2304.02643}, 2023.
36
+
37
+ \bibitem{vot10}
38
+ Matej Kristan, Ale{\v{s}} Leonardis, Ji{\v{r}}{\'\i} Matas, Michael Felsberg,
39
+ Roman Pflugfelder, Joni-Kristian K{\"a}m{\"a}r{\"a}inen, Hyung~Jin Chang,
40
+ Martin Danelljan, Luka~{\v{C}}ehovin Zajc, Alan Luke{\v{z}}i{\v{c}}, et~al.
41
+ \newblock The tenth visual object tracking vot2022 challenge results.
42
+ \newblock In {\em Computer Vision--ECCV 2022 Workshops: Tel Aviv, Israel,
43
+ October 23--27, 2022, Proceedings, Part VIII}, pages 431--460. Springer,
44
+ 2023.
45
+
46
+ \bibitem{vot8}
47
+ Matej Kristan, Ale{\v{s}} Leonardis, Ji{\v{r}}{\'\i} Matas, Michael Felsberg,
48
+ Roman Pflugfelder, Joni-Kristian K{\"a}m{\"a}r{\"a}inen, Martin Danelljan,
49
+ Luka~{\v{C}}ehovin Zajc, Alan Luke{\v{z}}i{\v{c}}, Ondrej Drbohlav, et~al.
50
+ \newblock The eighth visual object tracking vot2020 challenge results.
51
+ \newblock In {\em European Conference on Computer Vision}, pages 547--601.
52
+ Springer, 2020.
53
+
54
+ \bibitem{vot6}
55
+ Matej Kristan, Ales Leonardis, Jiri Matas, Michael Felsberg, Roman Pflugfelder,
56
+ Luka ˇCehovin~Zajc, Tomas Vojir, Goutam Bhat, Alan Lukezic, Abdelrahman
57
+ Eldesokey, et~al.
58
+ \newblock The sixth visual object tracking vot2018 challenge results.
59
+ \newblock In {\em Proceedings of the European Conference on Computer Vision
60
+ (ECCV) Workshops}, pages 0--0, 2018.
61
+
62
+ \bibitem{vot9}
63
+ Matej Kristan, Ji{\v{r}}{\'\i} Matas, Ale{\v{s}} Leonardis, Michael Felsberg,
64
+ Roman Pflugfelder, Joni-Kristian K{\"a}m{\"a}r{\"a}inen, Hyung~Jin Chang,
65
+ Martin Danelljan, Luka Cehovin, Alan Luke{\v{z}}i{\v{c}}, et~al.
66
+ \newblock The ninth visual object tracking vot2021 challenge results.
67
+ \newblock In {\em Proceedings of the IEEE/CVF International Conference on
68
+ Computer Vision}, pages 2711--2738, 2021.
69
+
70
+ \bibitem{vot7}
71
+ Matej Kristan, Jiri Matas, Ales Leonardis, Michael Felsberg, Roman Pflugfelder,
72
+ Joni-Kristian Kamarainen, Luka ˇCehovin~Zajc, Ondrej Drbohlav, Alan Lukezic,
73
+ Amanda Berg, et~al.
74
+ \newblock The seventh visual object tracking vot2019 challenge results.
75
+ \newblock In {\em Proceedings of the IEEE/CVF International Conference on
76
+ Computer Vision Workshops}, pages 0--0, 2019.
77
+
78
+ \bibitem{e2fgvi}
79
+ Zhen Li, Chengze Lu, Jianhua Qin, Chun{-}Le Guo, and Ming{-}Ming Cheng.
80
+ \newblock Towards an end-to-end framework for flow-guided video inpainting.
81
+ \newblock In {\em {CVPR}}, pages 17541--17550. {IEEE}, 2022.
82
+
83
+ \bibitem{stm}
84
+ Seoung~Wug Oh, Joon{-}Young Lee, Ning Xu, and Seon~Joo Kim.
85
+ \newblock Video object segmentation using space-time memory networks.
86
+ \newblock In {\em {ICCV}}, pages 9225--9234. {IEEE}, 2019.
87
+
88
+ \bibitem{davis}
89
+ Jordi Pont{-}Tuset, Federico Perazzi, Sergi Caelles, Pablo Arbelaez, Alexander
90
+ Sorkine{-}Hornung, and Luc~Van Gool.
91
+ \newblock The 2017 {DAVIS} challenge on video object segmentation.
92
+ \newblock {\em CoRR}, abs/1704.00675, 2017.
93
+
94
+ \bibitem{siammask}
95
+ Qiang Wang, Li~Zhang, Luca Bertinetto, Weiming Hu, and Philip H.~S. Torr.
96
+ \newblock Fast online object tracking and segmentation: {A} unifying approach.
97
+ \newblock In {\em {CVPR}}, pages 1328--1338. Computer Vision Foundation /
98
+ {IEEE}, 2019.
99
+
100
+ \bibitem{aot}
101
+ Zongxin Yang, Yunchao Wei, and Yi~Yang.
102
+ \newblock Associating objects with transformers for video object segmentation.
103
+ \newblock In {\em NeurIPS}, pages 2491--2502, 2021.
104
+
105
+ \end{thebibliography}
overleaf/Track Anything/neurips_2022.bib ADDED
@@ -0,0 +1,187 @@
1
+ @article{sam,
2
+ title={Segment anything},
3
+ author={Kirillov, Alexander and Mintun, Eric and Ravi, Nikhila and Mao, Hanzi and Rolland, Chloe and Gustafson, Laura and Xiao, Tete and Whitehead, Spencer and Berg, Alexander C and Lo, Wan-Yen and others},
4
+ journal={arXiv preprint arXiv:2304.02643},
5
+ year={2023}
6
+ }
7
+
8
+ @inproceedings{xmem,
9
+ author = {Ho Kei Cheng and
10
+ Alexander G. Schwing},
11
+ title = {XMem: Long-Term Video Object Segmentation with an Atkinson-Shiffrin
12
+ Memory Model},
13
+ booktitle = {{ECCV} {(28)}},
14
+ series = {Lecture Notes in Computer Science},
15
+ volume = {13688},
16
+ pages = {640--658},
17
+ publisher = {Springer},
18
+ year = {2022}
19
+ }
20
+
21
+
22
+ %related
23
+
24
+ @article{vos,
25
+ author = {Mingqi Gao and
26
+ Feng Zheng and
27
+ James J. Q. Yu and
28
+ Caifeng Shan and
29
+ Guiguang Ding and
30
+ Jungong Han},
31
+ title = {Deep learning for video object segmentation: a review},
32
+ journal = {Artif. Intell. Rev.},
33
+ volume = {56},
34
+ number = {1},
35
+ pages = {457--531},
36
+ year = {2023}
37
+ }
38
+
39
+ @inproceedings{vot9,
40
+ title={The ninth visual object tracking vot2021 challenge results},
41
+ author={Kristan, Matej and Matas, Ji{\v{r}}{\'\i} and Leonardis, Ale{\v{s}} and Felsberg, Michael and Pflugfelder, Roman and K{\"a}m{\"a}r{\"a}inen, Joni-Kristian and Chang, Hyung Jin and Danelljan, Martin and Cehovin, Luka and Luke{\v{z}}i{\v{c}}, Alan and others},
42
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
43
+ pages={2711--2738},
44
+ year={2021}
45
+ }
46
+
47
+ @inproceedings{vot10,
48
+ title={The Tenth Visual Object Tracking VOT2022 Challenge Results},
49
+ author={Kristan, Matej and Leonardis, Ale{\v{s}} and Matas, Ji{\v{r}}{\'\i} and Felsberg, Michael and Pflugfelder, Roman and K{\"a}m{\"a}r{\"a}inen, Joni-Kristian and Chang, Hyung Jin and Danelljan, Martin and Zajc, Luka {\v{C}}ehovin and Luke{\v{z}}i{\v{c}}, Alan and others},
50
+ booktitle={Computer Vision--ECCV 2022 Workshops: Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part VIII},
51
+ pages={431--460},
52
+ year={2023},
53
+ organization={Springer}
54
+ }
55
+
56
+ @inproceedings{vot8,
57
+ title={The eighth visual object tracking VOT2020 challenge results},
58
+ author={Kristan, Matej and Leonardis, Ale{\v{s}} and Matas, Ji{\v{r}}{\'\i} and Felsberg, Michael and Pflugfelder, Roman and K{\"a}m{\"a}r{\"a}inen, Joni-Kristian and Danelljan, Martin and Zajc, Luka {\v{C}}ehovin and Luke{\v{z}}i{\v{c}}, Alan and Drbohlav, Ondrej and others},
59
+ booktitle={European Conference on Computer Vision},
60
+ pages={547--601},
61
+ year={2020},
62
+ organization={Springer}
63
+ }
64
+ @inproceedings{vot7,
65
+ title={The seventh visual object tracking vot2019 challenge results},
66
+ author={Kristan, Matej and Matas, Jiri and Leonardis, Ales and Felsberg, Michael and Pflugfelder, Roman and Kamarainen, Joni-Kristian and ˇCehovin Zajc, Luka and Drbohlav, Ondrej and Lukezic, Alan and Berg, Amanda and others},
67
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision Workshops},
68
+ pages={0--0},
69
+ year={2019}
70
+ }
71
+ @inproceedings{vot6,
72
+ title={The sixth visual object tracking vot2018 challenge results},
73
+ author={Kristan, Matej and Leonardis, Ales and Matas, Jiri and Felsberg, Michael and Pflugfelder, Roman and ˇCehovin Zajc, Luka and Vojir, Tomas and Bhat, Goutam and Lukezic, Alan and Eldesokey, Abdelrahman and others},
74
+ booktitle={Proceedings of the European Conference on Computer Vision (ECCV) Workshops},
75
+ pages={0--0},
76
+ year={2018}
77
+ }
78
+
79
+ @inproceedings{vit,
80
+ author = {Alexey Dosovitskiy and
81
+ Lucas Beyer and
82
+ Alexander Kolesnikov and
83
+ Dirk Weissenborn and
84
+ Xiaohua Zhai and
85
+ Thomas Unterthiner and
86
+ Mostafa Dehghani and
87
+ Matthias Minderer and
88
+ Georg Heigold and
89
+ Sylvain Gelly and
90
+ Jakob Uszkoreit and
91
+ Neil Houlsby},
92
+ title = {An Image is Worth 16x16 Words: Transformers for Image Recognition
93
+ at Scale},
94
+ booktitle = {{ICLR}},
95
+ publisher = {OpenReview.net},
96
+ year = {2021}
97
+ }
98
+
99
+ @inproceedings{stm,
100
+ author = {Seoung Wug Oh and
101
+ Joon{-}Young Lee and
102
+ Ning Xu and
103
+ Seon Joo Kim},
104
+ title = {Video Object Segmentation Using Space-Time Memory Networks},
105
+ booktitle = {{ICCV}},
106
+ pages = {9225--9234},
107
+ publisher = {{IEEE}},
108
+ year = {2019}
109
+ }
110
+
111
+ @inproceedings{siammask,
112
+ author = {Qiang Wang and
113
+ Li Zhang and
114
+ Luca Bertinetto and
115
+ Weiming Hu and
116
+ Philip H. S. Torr},
117
+ title = {Fast Online Object Tracking and Segmentation: {A} Unifying Approach},
118
+ booktitle = {{CVPR}},
119
+ pages = {1328--1338},
120
+ publisher = {Computer Vision Foundation / {IEEE}},
121
+ year = {2019}
122
+ }
123
+
124
+ @inproceedings{mivos,
125
+ author = {Ho Kei Cheng and
126
+ Yu{-}Wing Tai and
127
+ Chi{-}Keung Tang},
128
+ title = {Modular Interactive Video Object Segmentation: Interaction-to-Mask,
129
+ Propagation and Difference-Aware Fusion},
130
+ booktitle = {{CVPR}},
131
+ pages = {5559--5568},
132
+ publisher = {Computer Vision Foundation / {IEEE}},
133
+ year = {2021}
134
+ }
135
+
136
+ @article{davis,
137
+ author = {Jordi Pont{-}Tuset and
138
+ Federico Perazzi and
139
+ Sergi Caelles and
140
+ Pablo Arbelaez and
141
+ Alexander Sorkine{-}Hornung and
142
+ Luc Van Gool},
143
+ title = {The 2017 {DAVIS} Challenge on Video Object Segmentation},
144
+ journal = {CoRR},
145
+ volume = {abs/1704.00675},
146
+ year = {2017}
147
+ }
148
+
149
+ @inproceedings{aot,
150
+ author = {Zongxin Yang and
151
+ Yunchao Wei and
152
+ Yi Yang},
153
+ title = {Associating Objects with Transformers for Video Object Segmentation},
154
+ booktitle = {NeurIPS},
155
+ pages = {2491--2502},
156
+ year = {2021}
157
+ }
158
+
159
+ @inproceedings{icip,
160
+ author = {St{\'{e}}phane Vujasinovic and
161
+ Sebastian Bullinger and
162
+ Stefan Becker and
163
+ Norbert Scherer{-}Negenborn and
164
+ Michael Arens and
165
+ Rainer Stiefelhagen},
166
+ title = {Revisiting Click-Based Interactive Video Object Segmentation},
167
+ booktitle = {{ICIP}},
168
+ pages = {2756--2760},
169
+ publisher = {{IEEE}},
170
+ year = {2022}
171
+ }
172
+
173
+
174
+
175
+
176
+ @inproceedings{e2fgvi,
177
+ author = {Zhen Li and
178
+ Chengze Lu and
179
+ Jianhua Qin and
180
+ Chun{-}Le Guo and
181
+ Ming{-}Ming Cheng},
182
+ title = {Towards An End-to-End Framework for Flow-Guided Video Inpainting},
183
+ booktitle = {{CVPR}},
184
+ pages = {17541--17550},
185
+ publisher = {{IEEE}},
186
+ year = {2022}
187
+ }
overleaf/Track Anything/neurips_2022.sty ADDED
@@ -0,0 +1,381 @@
1
+ % partial rewrite of the LaTeX2e package for submissions to the
2
+ % Conference on Neural Information Processing Systems (NeurIPS):
3
+ %
4
+ % - uses more LaTeX conventions
5
+ % - line numbers at submission time replaced with aligned numbers from
6
+ % lineno package
7
+ % - \nipsfinalcopy replaced with [final] package option
8
+ % - automatically loads times package for authors
9
+ % - loads natbib automatically; this can be suppressed with the
10
+ % [nonatbib] package option
11
+ % - adds foot line to first page identifying the conference
12
+ % - adds preprint option for submission to e.g. arXiv
13
+ % - conference acronym modified
14
+ %
15
+ % Roman Garnett (garnett@wustl.edu) and the many authors of
16
+ % nips15submit_e.sty, including MK and drstrip@sandia
17
+ %
18
+ % last revision: March 2022
19
+
20
+ \NeedsTeXFormat{LaTeX2e}
21
+ \ProvidesPackage{neurips_2022}[2022/03/31 NeurIPS 2022 submission/camera-ready style file]
22
+
23
+ % declare final option, which creates camera-ready copy
24
+ \newif\if@neuripsfinal\@neuripsfinalfalse
25
+ \DeclareOption{final}{
26
+ \@neuripsfinaltrue
27
+ }
28
+
29
+ % declare nonatbib option, which does not load natbib in case of
30
+ % package clash (users can pass options to natbib via
31
+ % \PassOptionsToPackage)
32
+ \newif\if@natbib\@natbibtrue
33
+ \DeclareOption{nonatbib}{
34
+ \@natbibfalse
35
+ }
36
+
37
+ % declare preprint option, which creates a preprint version ready for
38
+ % upload to, e.g., arXiv
39
+ \newif\if@preprint\@preprintfalse
40
+ \DeclareOption{preprint}{
41
+ \@preprinttrue
42
+ }
43
+
44
+ \ProcessOptions\relax
45
+
46
+ % determine whether this is an anonymized submission
47
+ \newif\if@submission\@submissiontrue
48
+ \if@neuripsfinal\@submissionfalse\fi
49
+ \if@preprint\@submissionfalse\fi
50
+
51
+ % fonts
52
+ \renewcommand{\rmdefault}{ptm}
53
+ \renewcommand{\sfdefault}{phv}
54
+
55
+ % change this every year for notice string at bottom
56
+ \newcommand{\@neuripsordinal}{36th}
57
+ \newcommand{\@neuripsyear}{2022}
58
+ \newcommand{\@neuripslocation}{New Orleans}
59
+
60
+ % acknowledgments
61
+ \usepackage{environ}
62
+ \newcommand{\acksection}{\section*{Acknowledgments and Disclosure of Funding}}
63
+ \NewEnviron{ack}{%
64
+ \acksection
65
+ \BODY
66
+ }
67
+
68
+
69
+ % load natbib unless told otherwise
70
+ %\if@natbib
71
+ % \RequirePackage{natbib}
72
+ %\fi
73
+
74
+ % set page geometry
75
+ \usepackage[verbose=true,letterpaper]{geometry}
76
+ \AtBeginDocument{
77
+ \newgeometry{
78
+ textheight=9in,
79
+ textwidth=5.5in,
80
+ top=1in,
81
+ headheight=12pt,
82
+ headsep=25pt,
83
+ footskip=30pt
84
+ }
85
+ \@ifpackageloaded{fullpage}
86
+ {\PackageWarning{neurips_2022}{fullpage package not allowed! Overwriting formatting.}}
87
+ {}
88
+ }
89
+
90
+ \widowpenalty=10000
91
+ \clubpenalty=10000
92
+ \flushbottom
93
+ \sloppy
94
+
95
+
96
+ % font sizes with reduced leading
97
+ \renewcommand{\normalsize}{%
98
+ \@setfontsize\normalsize\@xpt\@xipt
99
+ \abovedisplayskip 7\p@ \@plus 2\p@ \@minus 5\p@
100
+ \abovedisplayshortskip \z@ \@plus 3\p@
101
+ \belowdisplayskip \abovedisplayskip
102
+ \belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@
103
+ }
104
+ \normalsize
105
+ \renewcommand{\small}{%
106
+ \@setfontsize\small\@ixpt\@xpt
107
+ \abovedisplayskip 6\p@ \@plus 1.5\p@ \@minus 4\p@
108
+ \abovedisplayshortskip \z@ \@plus 2\p@
109
+ \belowdisplayskip \abovedisplayskip
110
+ \belowdisplayshortskip 3\p@ \@plus 2\p@ \@minus 2\p@
111
+ }
112
+ \renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt}
113
+ \renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt}
114
+ \renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt}
115
+ \renewcommand{\large}{\@setfontsize\large\@xiipt{14}}
116
+ \renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}}
117
+ \renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}}
118
+ \renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}}
119
+ \renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}}
120
+
121
+ % sections with less space
122
+ \providecommand{\section}{}
123
+ \renewcommand{\section}{%
124
+ \@startsection{section}{1}{\z@}%
125
+ {-2.0ex \@plus -0.5ex \@minus -0.2ex}%
126
+ { 1.5ex \@plus 0.3ex \@minus 0.2ex}%
127
+ {\large\bf\raggedright}%
128
+ }
129
+ \providecommand{\subsection}{}
130
+ \renewcommand{\subsection}{%
131
+ \@startsection{subsection}{2}{\z@}%
132
+ {-1.8ex \@plus -0.5ex \@minus -0.2ex}%
133
+ { 0.8ex \@plus 0.2ex}%
134
+ {\normalsize\bf\raggedright}%
135
+ }
136
+ \providecommand{\subsubsection}{}
137
+ \renewcommand{\subsubsection}{%
138
+ \@startsection{subsubsection}{3}{\z@}%
139
+ {-1.5ex \@plus -0.5ex \@minus -0.2ex}%
140
+ { 0.5ex \@plus 0.2ex}%
141
+ {\normalsize\bf\raggedright}%
142
+ }
143
+ \providecommand{\paragraph}{}
144
+ \renewcommand{\paragraph}{%
145
+ \@startsection{paragraph}{4}{\z@}%
146
+ {1.5ex \@plus 0.5ex \@minus 0.2ex}%
147
+ {-1em}%
148
+ {\normalsize\bf}%
149
+ }
150
+ \providecommand{\subparagraph}{}
151
+ \renewcommand{\subparagraph}{%
152
+ \@startsection{subparagraph}{5}{\z@}%
153
+ {1.5ex \@plus 0.5ex \@minus 0.2ex}%
154
+ {-1em}%
155
+ {\normalsize\bf}%
156
+ }
157
+ \providecommand{\subsubsubsection}{}
158
+ \renewcommand{\subsubsubsection}{%
159
+ \vskip5pt{\noindent\normalsize\rm\raggedright}%
160
+ }
161
+
162
+ % float placement
163
+ \renewcommand{\topfraction }{0.85}
164
+ \renewcommand{\bottomfraction }{0.4}
165
+ \renewcommand{\textfraction }{0.1}
166
+ \renewcommand{\floatpagefraction}{0.7}
167
+
168
+ \newlength{\@neuripsabovecaptionskip}\setlength{\@neuripsabovecaptionskip}{7\p@}
169
+ \newlength{\@neuripsbelowcaptionskip}\setlength{\@neuripsbelowcaptionskip}{\z@}
170
+
171
+ \setlength{\abovecaptionskip}{\@neuripsabovecaptionskip}
172
+ \setlength{\belowcaptionskip}{\@neuripsbelowcaptionskip}
173
+
174
+ % swap above/belowcaptionskip lengths for tables
175
+ \renewenvironment{table}
176
+ {\setlength{\abovecaptionskip}{\@neuripsbelowcaptionskip}%
177
+ \setlength{\belowcaptionskip}{\@neuripsabovecaptionskip}%
178
+ \@float{table}}
179
+ {\end@float}
180
+
181
+ % footnote formatting
182
+ \setlength{\footnotesep }{6.65\p@}
183
+ \setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@}
184
+ \renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@}
185
+ \setcounter{footnote}{0}
186
+
187
+ % paragraph formatting
188
+ \setlength{\parindent}{\z@}
189
+ \setlength{\parskip }{5.5\p@}
190
+
191
+ % list formatting
192
+ \setlength{\topsep }{4\p@ \@plus 1\p@ \@minus 2\p@}
193
+ \setlength{\partopsep }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@}
194
+ \setlength{\itemsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@}
195
+ \setlength{\parsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@}
196
+ \setlength{\leftmargin }{3pc}
197
+ \setlength{\leftmargini }{\leftmargin}
198
+ \setlength{\leftmarginii }{2em}
199
+ \setlength{\leftmarginiii}{1.5em}
200
+ \setlength{\leftmarginiv }{1.0em}
201
+ \setlength{\leftmarginv }{0.5em}
202
+ \def\@listi {\leftmargin\leftmargini}
203
+ \def\@listii {\leftmargin\leftmarginii
204
+ \labelwidth\leftmarginii
205
+ \advance\labelwidth-\labelsep
206
+ \topsep 2\p@ \@plus 1\p@ \@minus 0.5\p@
207
+ \parsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@
208
+ \itemsep \parsep}
209
+ \def\@listiii{\leftmargin\leftmarginiii
210
+ \labelwidth\leftmarginiii
211
+ \advance\labelwidth-\labelsep
212
+ \topsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@
213
+ \parsep \z@
214
+ \partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@
215
+ \itemsep \topsep}
216
+ \def\@listiv {\leftmargin\leftmarginiv
217
+ \labelwidth\leftmarginiv
218
+ \advance\labelwidth-\labelsep}
219
+ \def\@listv {\leftmargin\leftmarginv
220
+ \labelwidth\leftmarginv
221
+ \advance\labelwidth-\labelsep}
222
+ \def\@listvi {\leftmargin\leftmarginvi
223
+ \labelwidth\leftmarginvi
224
+ \advance\labelwidth-\labelsep}
225
+
226
+ % create title
227
+ \providecommand{\maketitle}{}
228
+ \renewcommand{\maketitle}{%
229
+ \par
230
+ \begingroup
231
+ \renewcommand{\thefootnote}{\fnsymbol{footnote}}
232
+ % for perfect author name centering
233
+ \renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}}
234
+ % The footnote-mark was overlapping the footnote-text,
235
+ % added the following to fix this problem (MK)
236
+ \long\def\@makefntext##1{%
237
+ \parindent 1em\noindent
238
+ \hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1
239
+ }
240
+ \thispagestyle{empty}
241
+ \@maketitle
242
+ \@thanks
243
+ \@notice
244
+ \endgroup
245
+ \let\maketitle\relax
246
+ \let\thanks\relax
247
+ }
248
+
249
+ % rules for title box at top of first page
250
+ \newcommand{\@toptitlebar}{
251
+ \hrule height 4\p@
252
+ \vskip 0.25in
253
+ \vskip -\parskip%
254
+ }
255
+ \newcommand{\@bottomtitlebar}{
256
+ \vskip 0.29in
257
+ \vskip -\parskip
258
+ \hrule height 1\p@
259
+ \vskip 0.09in%
260
+ }
261
+
262
+ % create title (includes both anonymized and non-anonymized versions)
263
+ \providecommand{\@maketitle}{}
264
+ \renewcommand{\@maketitle}{%
265
+ \vbox{%
266
+ \hsize\textwidth
267
+ \linewidth\hsize
268
+ \vskip 0.1in
269
+ \@toptitlebar
270
+ \centering
271
+ {\LARGE\bf \@title\par}
272
+ \@bottomtitlebar
273
+ \if@submission
274
+ \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}
275
+ Anonymous Author(s) \\
276
+ Affiliation \\
277
+ Address \\
278
+ \texttt{email} \\
279
+ \end{tabular}%
280
+ \else
281
+ \def\And{%
282
+ \end{tabular}\hfil\linebreak[0]\hfil%
283
+ \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces%
284
+ }
285
+ \def\AND{%
286
+ \end{tabular}\hfil\linebreak[4]\hfil%
287
+ \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces%
288
+ }
289
+ \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}%
290
+ \fi
291
+ \vskip 0.3in \@minus 0.1in
292
+ }
293
+ }
294
+
295
+ % add conference notice to bottom of first page
296
+ \newcommand{\ftype@noticebox}{8}
297
+ \newcommand{\@notice}{%
298
+ % give a bit of extra room back to authors on first page
299
+ \enlargethispage{2\baselineskip}%
300
+ \@float{noticebox}[b]%
301
+ \footnotesize\@noticestring%
302
+ \end@float%
303
+ }
304
+
305
+ % abstract styling
306
+ \renewenvironment{abstract}%
307
+ {%
308
+ \vskip 0.075in%
309
+ \centerline%
310
+ {\large\bf Abstract}%
311
+ \vspace{0.5ex}%
312
+ \begin{quote}%
313
+ }
314
+ {
315
+ \par%
316
+ \end{quote}%
317
+ \vskip 1ex%
318
+ }
319
+
320
+ % For the paper checklist
321
+ \newcommand{\answerYes}[1][]{\textcolor{blue}{[Yes] #1}}
322
+ \newcommand{\answerNo}[1][]{\textcolor{orange}{[No] #1}}
323
+ \newcommand{\answerNA}[1][]{\textcolor{gray}{[N/A] #1}}
324
+ \newcommand{\answerTODO}[1][]{\textcolor{red}{\bf [TODO]}}
325
+
326
+ % handle tweaks for camera-ready copy vs. submission copy
327
+ \if@preprint
328
+ \newcommand{\@noticestring}{%
329
+ Preprint. Under review.%
330
+ }
331
+ \else
332
+ \if@neuripsfinal
333
+ \newcommand{\@noticestring}{%
334
+ \@neuripsordinal\/ Conference on Neural Information Processing Systems
335
+ (NeurIPS \@neuripsyear).%, \@neuripslocation.%
336
+ }
337
+ \else
338
+ \newcommand{\@noticestring}{%
339
+ Submitted to \@neuripsordinal\/ Conference on Neural Information
340
+ Processing Systems (NeurIPS \@neuripsyear). Do not distribute.%
341
+ }
342
+
343
+ % hide the acknowledgements
344
+ \NewEnviron{hide}{}
345
+ \let\ack\hide
346
+ \let\endack\endhide
347
+
348
+ % line numbers for submission
349
+ \RequirePackage{lineno}
350
+ \linenumbers
351
+
352
+ % fix incompatibilities between lineno and amsmath, if required, by
353
+ % transparently wrapping linenomath environments around amsmath
354
+ % environments
355
+ \AtBeginDocument{%
356
+ \@ifpackageloaded{amsmath}{%
357
+ \newcommand*\patchAmsMathEnvironmentForLineno[1]{%
358
+ \expandafter\let\csname old#1\expandafter\endcsname\csname #1\endcsname
359
+ \expandafter\let\csname oldend#1\expandafter\endcsname\csname end#1\endcsname
360
+ \renewenvironment{#1}%
361
+ {\linenomath\csname old#1\endcsname}%
362
+ {\csname oldend#1\endcsname\endlinenomath}%
363
+ }%
364
+ \newcommand*\patchBothAmsMathEnvironmentsForLineno[1]{%
365
+ \patchAmsMathEnvironmentForLineno{#1}%
366
+ \patchAmsMathEnvironmentForLineno{#1*}%
367
+ }%
368
+ \patchBothAmsMathEnvironmentsForLineno{equation}%
369
+ \patchBothAmsMathEnvironmentsForLineno{align}%
370
+ \patchBothAmsMathEnvironmentsForLineno{flalign}%
371
+ \patchBothAmsMathEnvironmentsForLineno{alignat}%
372
+ \patchBothAmsMathEnvironmentsForLineno{gather}%
373
+ \patchBothAmsMathEnvironmentsForLineno{multline}%
374
+ }
375
+ {}
376
+ }
377
+ \fi
378
+ \fi
379
+
380
+
381
+ \endinput
overleaf/Track Anything/neurips_2022.tex ADDED
@@ -0,0 +1,378 @@
1
+ \documentclass{article}
2
+
3
+
4
+ % if you need to pass options to natbib, use, e.g.:
5
+ % \PassOptionsToPackage{numbers, compress}{natbib}
6
+ % before loading neurips_2022
7
+
8
+
9
+ % ready for submission
10
+ % \usepackage{neurips_2022}
11
+
12
+
13
+ % to compile a preprint version, e.g., for submission to arXiv, add add the
14
+ % [preprint] option:
15
+ \usepackage[preprint]{neurips_2022}
16
+
17
+ % to compile a camera-ready version, add the [final] option, e.g.:
18
+ % \usepackage[final]{neurips_2022}
19
+
20
+
21
+ % to avoid loading the natbib package, add option nonatbib:
22
+ % \usepackage[nonatbib]{neurips_2022}
23
+ \usepackage{graphicx}
24
+ \usepackage[utf8]{inputenc} % allow utf-8 input
25
+ \usepackage[T1]{fontenc} % use 8-bit T1 fonts
26
+ \usepackage{hyperref} % hyperlinks
27
+ \usepackage{url} % simple URL typesetting
28
+ \usepackage{booktabs} % professional-quality tables
29
+ \usepackage{amsfonts} % blackboard math symbols
30
+ \usepackage{nicefrac} % compact symbols for 1/2, etc.
31
+ \usepackage{microtype} % microtypography
32
+ \usepackage{xcolor} % colors
33
+ % \usepackage{acmart}
34
+
35
+ \title{Track Anything: High-performance Interactive Tracking and Segmentation}
36
+ \title{Track Anything: High-performance Object Tracking in Videos by Interactive Masks}
37
+ % \title{Track Anything: Interaction to Mask in Videos}
38
+ \title{Track Anything: Segment Anything Meets Videos}
39
+
40
+ % \author{%
41
+ % David S.~Hippocampus\thanks{Use footnote for providing further information
42
+ % about author (webpage, alternative address)---\emph{not} for acknowledging
43
+ % funding agencies.} \\
44
+ % SUSTech VIPG\\
45
+
46
+ % \author{Jinyu Yang}
47
+ % \authornote{equal}
48
+
49
+ % \author{Mingqi Gao}
50
+ % \authornotemark[1]
51
+
52
+ \author{%
53
+ Jinyu Yang\thanks{Equal contribution. Alphabetical order.},\enskip Mingqi Gao\footnotemark[1],\enskip Zhe Li\footnotemark[1],\enskip Shang Gao, Fangjing Wang, Feng Zheng \\
54
+ SUSTech VIP Lab\\
55
+ % Cranberry-Lemon University\\
56
+ % Pittsburgh, PA 15213 \\
57
+ % \texttt{hippo@cs.cranberry-lemon.edu} \\
58
+ % \url{https://github.com/gaomingqi/Track-Anything}\\
59
+ % examples of more authors
60
+ % \And
61
+ % Coauthor \\
62
+ % Affiliation \\
63
+ % Address \\
64
+ % \texttt{email} \\
65
+ % \AND
66
+ % Coauthor \\
67
+ % Affiliation \\
68
+ % Address \\
69
+ % \texttt{email} \\
70
+ % \And
71
+ % Coauthor \\
72
+ % Affiliation \\
73
+ % Address \\
74
+ % \texttt{email} \\
75
+ % \And
76
+ % Coauthor \\
77
+ % Affiliation \\
78
+ % Address \\
79
+ % \texttt{email} \\
80
+ % \thanks{these authors contributed equally}
81
+ }
82
+ % \affiliation{\institution{SUSTech VIP Lab}}
83
+ % \footnote{Equal contribution. Alphabetical order.}
84
+
85
+ \begin{document}
86
+
87
+
88
+ \maketitle
89
+
90
+
91
+ \begin{abstract}
92
+
93
+ Recently, the Segment Anything Model (SAM) has rapidly gained widespread attention due to its impressive segmentation performance on images.
94
+ Despite its strong ability in image segmentation and its high interactivity with different prompts, we find that it performs poorly on consistent segmentation in videos.
95
+ Therefore, in this report, we propose Track Anything Model (TAM), which achieves high-performance interactive tracking and segmentation in videos.
96
+ In detail, given a video sequence, with only a little human participation, \textit{i.e.}, several clicks, users can track anything they are interested in and obtain satisfactory results in one-pass inference.
97
+ Without additional training, such an interactive design performs impressively on video object tracking and segmentation.
98
+ % superior to prior works on video object tracking and segmentation.
99
+ All resources are available on \url{https://github.com/gaomingqi/Track-Anything}.
100
+ We hope this work can facilitate related research.
101
+
102
+ \end{abstract}
103
+
104
+ \section{Introduction}
105
+
106
+ Tracking an arbitrary object in generic scenes is important, and Video Object Tracking (VOT) is a fundamental task in computer vision.
107
+ Similar to VOT, Video Object Segmentation (VOS) aims to separate the target (region of interest) from the background in a video sequence, which can be seen as a kind of more fine-grained object tracking.
108
+ We notice that current state-of-the-art video trackers/segmenters are trained on large-scale manually-annotated datasets and initialized by a bounding box or a segmentation mask.
109
+ On the one hand, the massive human labor force is hidden behind huge amounts of labeled data.
110
+ % Recently, interactive algorithms help to liberate users from labor-expensive initialization and annotation.
111
+ On the other hand, current initialization settings, especially in semi-supervised VOS, require a specific object mask ground truth for model initialization.
112
+ How to liberate researchers from labor-expensive annotation and initialization is therefore of great importance.
113
+
114
+
115
+ Recently, the Segment Anything Model (SAM)~\cite{sam}, a large foundation model for image segmentation, has been proposed.
116
+ It supports flexible prompts and computes masks in real-time, thus allowing interactive use.
117
+ We conclude that SAM has the following advantages that can assist interactive tracking:
118
+ \textbf{1) Strong image segmentation ability.}
119
+ Trained on 11 million images and 1.1 billion masks, SAM can produce high-quality masks and perform zero-shot segmentation in generic scenarios.
120
+ \textbf{2) High interactivity with different kinds of prompts. }
121
+ With user-friendly input prompts such as points, boxes, or language, SAM can produce satisfactory segmentation masks for specific image areas.
122
+ However, directly applying SAM to videos does not yield impressive performance due to its lack of temporal correspondence.
123
+
124
+ On the other hand, tracking or segmenting in videos faces challenges from scale variation, target deformation, motion blur, camera motion, similar objects, and so on~\cite{vos,vot6,vot7,vot8,vot9,vot10}.
125
+ Even state-of-the-art models suffer from complex scenarios in public datasets~\cite{xmem}, not to mention real-world applications.
126
+ Therefore, we consider the following question:
127
+ \textit{can we achieve high-performance tracking/segmentation in videos through the way of interaction?}
128
+
129
+ In this technical report, we introduce our Track-Anything project, which develops an efficient toolkit for high-performance object tracking and segmentation in videos.
130
+ With a user-friendly interface, the Track Anything Model (TAM) can track and segment any objects in a given video with only one-pass inference.
131
+ Figure~\ref{fig:overview} shows the one-pass interactive process in the proposed TAM.
132
+ In detail, TAM combines SAM~\cite{sam}, a large segmentation model, and XMem~\cite{xmem}, an advanced VOS model.
133
+ As shown, we integrate them in an interactive way.
134
+ Firstly, users can interactively initialize SAM, \textit{i.e.}, by clicking on the object, to define a target object;
135
+ then, XMem is used to give a mask prediction of the object in the next frame according to both temporal and spatial correspondence;
136
+ next, SAM is utilized to give a more precise mask description;
137
+ during the tracking process, users can pause and correct as soon as they notice tracking failures.
138
+
139
+ Our contributions can be summarized as follows:
140
+
141
+ 1) We extend SAM applications to the video level to achieve interactive video object tracking and segmentation.
142
+ % We combine the SAM with VOS models to achieve interactive video object tracking and segmentation.
143
+ Rather than separately using SAM per frame, we integrate SAM into the process of temporal correspondence construction.
144
+
145
+ 2) We propose one-pass interactive tracking and segmentation for efficient annotation and a user-friendly tracking interface, which solves extreme difficulties in video object perception with a very small amount of human participation.
146
+
147
+ 3) Our proposed method shows superior performance and high usability in complex scenes and has many potential applications.
148
+
149
+ % \section{Related Works}
150
+
151
+ % \textbf{Video Object Tracking.}
152
+
153
+
154
+
155
+ % \textbf{Video Object Segmentation.}
156
+ \section{Track Anything Task}
157
+
158
+ Inspired by the Segment Anything task~\cite{sam}, we propose the Track Anything task, which aims at flexible object tracking in arbitrary videos.
159
+ Here we define that the target objects can be flexibly selected, added, or removed in any way according to the users' interests.
160
+ Also, the video length and type can be arbitrary rather than limited to trimmed or natural videos.
161
+ With such settings, diverse downstream tasks can be achieved, including single/multiple object tracking, short-/long-term object tracking, unsupervised VOS, semi-supervised VOS, referring VOS, interactive VOS, long-term VOS, and so on.
162
+
163
+ \section{Methodology}
164
+
165
+ \subsection{Preliminaries}
166
+
167
+ \textbf{Segment Anything Model~\cite{sam}.}
168
+ Very recently, the Segment Anything Model (SAM) was proposed by Meta AI Research and has received considerable attention.
169
+ As a foundation model for image segmentation, SAM is based on ViT~\cite{vit} and trained on the large-scale dataset SA-1B~\cite{sam}.
170
+ SAM shows promising segmentation ability on images, especially on zero-shot segmentation tasks.
171
+ Unfortunately, SAM shows superior performance only on image segmentation and cannot deal with complex video segmentation.
172
+
173
+
174
+ \textbf{XMem~\cite{xmem}.}
175
+ Given the mask description of the target object at the first frame, XMem can track the object and generate corresponding masks in the subsequent frames.
176
+ Inspired by the Atkinson-Shiffrin memory model, it aims to solve the difficulties in long-term videos with unified feature memory stores.
177
+ The drawbacks of XMem are also obvious: 1) as a semi-supervised VOS model, it requires a precise mask to initialize; 2) for long videos, it is difficult for XMem to recover from tracking or segmentation failure.
178
+ In this paper, we address both difficulties by introducing interactive tracking with SAM.
179
+
180
+
181
+ \textbf{Interactive Video Object Segmentation.}
182
+ Interactive VOS~\cite{mivos} takes user interactions as inputs, \textit{e.g.}, scribbles.
183
+ Then, users can iteratively refine the segmentation results until they are satisfied with them.
184
+ Interactive VOS has gained much attention, as it is much easier to provide scribbles than to specify every pixel of an object mask.
185
+ However, we found that current interactive VOS methods require multiple rounds to refine the results, which impedes their efficiency in real-world applications.
186
+
187
+ \begin{figure}[t]
188
+ \centering
189
+ \includegraphics[width=\linewidth]{figs/overview_4.pdf}
190
+ \caption{Pipeline of our proposed Track Anything Model (TAM). Within only one round of inference, TAM obtains impressive tracking and segmentation performance on the human-selected target.}
191
+ \label{fig:overview}
192
+ \end{figure}
193
+
194
+ \begin{table}
195
+ \caption{Results on DAVIS-2016-val and DAVIS-2017-test-dev datasets~\cite{davis}.}
196
+ \label{davis1617}
197
+ \centering
198
+ \small
199
+ \setlength\tabcolsep{4pt}
200
+ \begin{tabular}{l|c|c|c|ccc|ccc}
201
+ \toprule
202
+ & & & &\multicolumn{3}{c|}{DAVIS-2016-val} &\multicolumn{3}{c}{DAVIS-2017-test-dev} \\
203
+ Method & Venue & Initialization & Evaluation& $J\&F$ & $J$ &$F$ &$J\&F$ & $J$ &$F$\\
204
+ \midrule
205
+ STM~\cite{stm} & ICCV2019 &Mask & One Pass &89.3 &88.7 &89.9 & 72.2 & 69.3 & 75.2 \\
206
+ AOT~\cite{aot} &NeurIPS2021 &Mask & One Pass & 91.1 & 90.1 & 92.1 & 79.6 & 75.9 & 83.3 \\
207
+ XMem~\cite{xmem} & NeurIPS2022 &Mask & One Pass & 92.0 &90.7 &93.2 & 81.2 & 77.6 & 84.7\\
208
+ \midrule
209
+ % SiamMask~\cite{siammask}& CVPR2019 &Box & One Pass & 69.8 &71.7 &67.8 &56.4 &54.3 &58.5 \\
210
+ SiamMask~\cite{siammask}& CVPR2019 &Box & One Pass & 69.8 &71.7 &67.8 &- &- &- \\
211
+ \midrule
212
+ % MiVOS~\cite{mivos} & CVPR2021 &Scribble &8 Rounds &91.0 &89.6 &92.4 & 84.5 &81.7 &87.4\\
213
+ MiVOS~\cite{mivos} & CVPR2021 &Scribble &8 Rounds &91.0 &89.6 &92.4 &78.6 &74.9 &82.2\\
214
+ % \midrule
215
+ % & ICIP2022 &Click & \\
216
+ \midrule
217
+ TAM (Proposed) &- & Click & One Pass & 88.4 & 87.5 &89.4 & 73.1 & 69.8 & 76.4\\
218
+ % Ours & & 5 Clicks & \\
219
+ \bottomrule
220
+ \end{tabular}
221
+ \end{table}
222
+
223
+
224
+
225
+ \subsection{Implementation}\label{implementation}
226
+
227
+ Inspired by SAM, we consider tracking anything in videos.
228
+ We aim to define this task with high interactivity and ease of use.
229
+ With such a design, high performance can be obtained with very little human interaction effort.
230
+ Figure~\ref{fig:overview} shows the pipeline of our Track Anything Model (TAM).
231
+ As shown, we divide our Track-Anything process into the following four steps:
232
+
233
+ \textbf{Step 1: Initialization with SAM~\cite{sam}.}
234
+ As SAM provides us with the opportunity to segment a region of interest with weak prompts, \textit{e.g.}, points and bounding boxes, we use it to give an initial mask of the target object.
235
+ Following SAM, users can obtain a mask of the object of interest with a single click, or refine it with several more clicks to get a satisfactory initialization.
236
+
237
+ \textbf{Step 2: Tracking with XMem~\cite{xmem}.}
238
+ Given the initialized mask, XMem performs semi-supervised VOS on the following frames.
239
+ Since XMem is an advanced VOS method that can output satisfactory results in simple scenarios, we directly output its predicted masks on most occasions.
240
+ When the mask quality is not good enough, we save the XMem predictions and the corresponding intermediate parameters, \textit{i.e.}, probes and affinities, and skip to Step 3.
241
+ % Given the initialized mask and the whole sequence, XMem performs semi-supervised VOS, which aims to solve the performance decay in long-term prediction with memory potentiation.
242
+
243
+
244
+ \textbf{Step 3: Refinement with SAM~\cite{sam}.}
245
+ We notice that during the inference of VOS models, it is challenging to keep predicting consistent and precise masks.
246
+ In fact, most state-of-the-art VOS models tend to segment more and more coarsely over time during inference.
247
+ Therefore, we utilize SAM to refine the masks predicted by XMem when their quality assessment is not satisfactory.
248
+ Specifically, we project the probes and affinities to be point prompts for SAM, and the predicted mask from Step 2 is used as a mask prompt for SAM.
249
+ Then, with these prompts, SAM is able to produce a refined segmentation mask.
250
+ Such refined masks will also be added to the temporal correspondence of XMem to refine all subsequent object discrimination.
251
+
252
+ \textbf{Step 4: Correction with human participation.}
253
+ % Long video annotation.
254
+ After the above three steps, the TAM can now successfully solve some common challenges and predict segmentation masks.
255
+ However, we notice that it is still difficult to accurately distinguish the objects in some extremely challenging scenarios, especially when processing long videos.
256
+ Therefore, we propose to add human correction during inference, which can bring a qualitative leap in performance with only a very small human effort.
257
+ In detail, users can forcibly pause the TAM process and correct the mask of the current frame with positive and negative clicks.
258
+
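The four steps above can be connected into a short one-pass inference loop. The sketch below is only a simplified illustration of that loop, not the repository's exact implementation: tracker.track follows the usage shown in tracker/base_tracker.py, while sam_segment, mask_quality, and to_point_prompts are hypothetical placeholders standing in for the SAM initialization/refinement and the quality assessment based on probes and affinities.

import numpy as np

def mask_quality(prob):
    """Hypothetical quality proxy: mean confidence inside the predicted region."""
    fg = prob[prob > 0.5]
    return float(fg.mean()) if fg.size else 0.0

def to_point_prompts(prob, k=5):
    """Hypothetical projection of tracker confidences to k positive point prompts."""
    ys, xs = np.unravel_index(np.argsort(prob.ravel())[-k:], prob.shape)
    return np.stack([xs, ys], axis=1), np.ones(k, dtype=np.int64)

def run_tam(frames, first_clicks, sam_segment, tracker, tau=0.8):
    """One-pass TAM loop over a list of (H, W, 3) frames, mirroring Steps 1-4."""
    # Step 1: initial mask from user clicks on the first frame (SAM).
    mask = sam_segment(frames[0], points=first_clicks)
    masks = []
    for ti, frame in enumerate(frames):
        if ti == 0:
            # Step 2: the XMem-style tracker is initialised with the SAM mask.
            mask, prob, _ = tracker.track(frame, mask)
        else:
            mask, prob, _ = tracker.track(frame)
            # Step 3: when the prediction looks unreliable, project the
            # intermediate outputs to prompts and let SAM refine the mask.
            if mask_quality(prob) < tau:
                points, labels = to_point_prompts(prob)
                mask = sam_segment(frame, points=points, labels=labels, mask_prompt=mask)
        # Step 4: a user may pause here and correct `mask` with extra clicks.
        masks.append(mask)
    return masks

In the released code, the quality assessment and the prompt projection are realised through XMem's probes and affinities, as described in Step 3 above.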
259
+ \section{Experiments}
260
+
261
+ \subsection{Quantitative Results}
262
+
263
+
264
+ To evaluate TAM, we utilize the validation set of DAVIS-2016 and the test-development set of DAVIS-2017~\cite{davis}.
265
+ % The evaluation process follows the one we proposed in Section~\ref{implementation}.
266
+ Then, we execute the proposed TAM as demonstrated in Section~\ref{implementation}.
267
+ The results are given in Table~\ref{davis1617}.
268
+ As shown, our TAM obtains $J\&F$ scores of 88.4 and 73.1 on DAVIS-2016-val and DAVIS-2017-test-dev datasets, respectively.
269
+ Note that TAM is initialized by clicks and evaluated in one pass.
270
+ Notably, we find that TAM performs well in difficult and complex scenarios.
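For reference, on the DAVIS benchmarks $J$ is the region similarity (the Jaccard index between the predicted mask $M$ and the ground truth $G$), $F$ is the boundary F-measure, and $J\&F$ reports their mean:
\[
J = \frac{|M \cap G|}{|M \cup G|}, \qquad J\&F = \frac{J + F}{2}.
\]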
271
+ % During the evaluation,
272
+
273
+ % click-based interactive video object segmentation
274
+
275
+ % CLICK-BASED INTERACTIVE VIDEO OBJECT
276
+ % SEGMENTATION
277
+
278
+
279
+ \begin{figure}[t]
280
+ \centering
281
+ \includegraphics[width=\linewidth]{figs/davisresults.pdf}
282
+ \caption{Qualitative results on video sequences from DAVIS-16 and DAVIS-17 datasets~\cite{davis}.}
283
+ \label{fig:davisresult}
284
+ \end{figure}
285
+
286
+
287
+ \begin{figure}[t]
288
+ \centering
289
+ \includegraphics[width=\linewidth]{figs/failedcases.pdf}
290
+ \caption{Failed cases.}
291
+ \label{fig:failedcases}
292
+ \end{figure}
293
+
294
+ \subsection{Qualitative Results}
295
+
296
+ % As we use a new one-pass interactive method to evaluation our TAM, here we only present some qualitative results.
297
+ We also give some qualitative results in Figure~\ref{fig:davisresult}.
298
+ As shown, TAM handles multi-object separation, target deformation, scale change, and camera motion well, which demonstrates its superior tracking and segmentation abilities with only click-based initialization and one-round inference.
299
+
300
+ \subsection{Failed Cases}
301
+ We here also analyze the failed cases, as shown in Figure~\ref{fig:failedcases}.
302
+ Overall, we notice that the failed cases typically appear in the following two situations.
303
+ 1)
304
+ % Separated masks of one object in a long video.
305
+ Current VOS models are mostly designed for short videos and thus focus more on maintaining short-term memory than long-term memory.
306
+ This leads to mask shrinkage or a lack of refinement in long videos, as shown in seq (a).
307
+ Essentially, we aim to address this in Step 3 with the refinement ability of SAM, but its effectiveness is lower than expected in realistic applications.
308
+ This indicates that the refinement ability of SAM based on multiple prompts can be further improved in the future.
309
+ On the other hand, human participation/interaction in TAM can be an approach to solving such difficulties, but too much interaction also results in low efficiency.
310
+ Thus, the mechanism of preserving long-term memory and updating transient memory remains important.
311
+ % Limited refinement by SAM. Although SAM supports to refine previous predictions, via point and mask prompts, . How to .
312
+ 2) When the object structure is complex, \textit{e.g.}, the bicycle wheels in seq (b), whose ground-truth masks contain many cavities, we find it very difficult to obtain a fine-grained initial mask by propagating the clicks.
313
+ Thus, the coarse initial masks may have side effects on the subsequent frames and lead to poor predictions.
314
+ This also suggests that SAM still struggles with complex and fine-grained structures.
315
+
316
+
317
+ \begin{figure}[t]
318
+ \centering
319
+ \includegraphics[width=\linewidth]{figs/avengers_1.pdf}
320
+ \caption{Raw frames, object masks, and inpainted results from the movie \textit{Captain America: Civil War (2016)}.}
321
+ \label{fig:captain}
322
+ \end{figure}
323
+
324
+
325
+
326
+ \section{Applications}
327
+ The proposed Track Anything Model (TAM) provides many possibilities for flexible tracking and segmentation in videos.
328
+ Here, we demonstrate several applications enabled by our proposed method.
329
+ % Our method may be able to a variety of applications.
330
+ In such an interactive way, diverse downstream tasks can be easily achieved.
331
+ % \textbf{Demo.}
332
+ % It is able to solve diverse downstream tasks in such a interactive way.
333
+
334
+ \textbf{Efficient video annotation.}
335
+ TAM has the ability to segment the regions of interest in videos and flexibly choose the objects users want to track. Thus, it can be used for video annotation for tasks like video object tracking and video object segmentation.
336
+ Moreover, click-based interaction makes it easy to use, and the annotation process is highly efficient.
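As an illustration of how such annotations could be exported, here is a minimal sketch that mirrors the per-frame PNG saving pattern used in tracker/base_tracker.py; the directory layout and file naming are assumptions, not a prescribed format.

import os
import numpy as np
from PIL import Image

def save_annotations(masks, out_dir):
    """Save per-frame masks (list of (H, W) integer arrays) as DAVIS-style PNG files."""
    os.makedirs(out_dir, exist_ok=True)
    for ti, mask in enumerate(masks):
        # Each pixel stores the object id (0 = background); optionally attach a palette.
        Image.fromarray(mask.astype(np.uint8)).save(f"{out_dir}/{ti:05d}.png")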
337
+
338
+
339
+ \textbf{Long-term object tracking.}
340
+ The study of long-term tracking is gaining more and more attention because it is much closer to practical applications.
341
+ The current long-term object tracking task requires the tracker to handle target disappearance and reappearance, yet it is still limited to the scope of trimmed videos.
342
+ Our TAM is better suited to real-world applications, as it can handle shot changes in long videos.
343
+
344
+
345
+ \textbf{User-friendly video editing.}
346
+ The Track Anything Model provides us with the flexibility to segment objects in videos.
347
+ With the object segmentation masks provided by TAM, we are then able to remove or alter any of the existing objects in a given video.
348
+ Here we combine TAM with E$^2$FGVI~\cite{e2fgvi} to evaluate its application value.
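A minimal sketch of this editing pipeline is shown below, assuming the object masks have already been produced by TAM. BaseInpainter is the class imported from inpainter/base_inpainter.py in this commit; the method name inpaint, its signature, and the checkpoint path are assumptions made for illustration only.

import numpy as np
from inpainter.base_inpainter import BaseInpainter

def remove_object(frames, masks, e2fgvi_checkpoint, device="cuda:0"):
    """frames: (T, H, W, 3) uint8 video; masks: (T, H, W) with 1 marking the object to remove."""
    inpainter = BaseInpainter(e2fgvi_checkpoint, device)
    # Assumed API: fill the masked regions in every frame with plausible background.
    return inpainter.inpaint(np.asarray(frames), np.asarray(masks))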
349
+
350
+ \textbf{Visualized development toolkit for video tasks.}
351
+ For ease of use, we also provide visualized interfaces for multiple video tasks, \textit{e.g.}, VOS, VOT, video inpainting, and so on.
352
+ With the provided toolkit, users can apply their models on real-world videos and visualize the results instantaneously.
353
+ Corresponding demos are available on Hugging Face\footnote{\url{https://huggingface.co/spaces/watchtowerss/Track-Anything}}.
354
+
355
+
356
+ To show its effectiveness, we conduct a comprehensive test by applying TAM to the movie \textit{Captain America: Civil War (2016)}.
357
+ Some representative results are given in Figure \ref{fig:captain}.
358
+ As shown, TAM can precisely track multiple objects in videos with many shot changes and can further be helpful for video inpainting.
359
+
360
+ % \section{Further work}
361
+
362
+
363
+ % \section*{Acknowledgements}
364
+
365
+ % \appendix
366
+
367
+ % \section{Appendix}
368
+
369
+
370
+ % Optionally include extra information (complete proofs, additional experiments and plots) in the appendix.
371
+ % This section will often be part of the supplemental material.
372
+
373
+
374
+
375
+ \bibliographystyle{plain}
376
+ \bibliography{neurips_2022}
377
+
378
+ \end{document}
requirements.txt CHANGED
@@ -12,5 +12,8 @@ pycocotools
12
  matplotlib
13
  onnxruntime
14
  onnx
 
15
  pyyaml
16
  av
 
 
12
  matplotlib
13
  onnxruntime
14
  onnx
15
+ metaseg==0.6.1
16
  pyyaml
17
  av
18
+ mmcv-full
19
+ mmengine
test_sample/test-sample13.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf112202beb75ecf7d04b27758f1f3eedfc218dac5d5dad0b72a07dd2db0f423
3
+ size 59659465
test_sample/test-sample2.mp4 ADDED
Binary file (473 kB). View file
test_sample/test-sample4.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d739a4b1a0ef3f5b50a9d26b2e767dcc590e6f5463805fc1f659e09d618d4ad
3
+ size 1366182
test_sample/test-sample8.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01d255ef82222d950d2cfae904d82cc20c752577016f0325b21788fb9b458bb9
3
+ size 11979994
track_anything.py CHANGED
@@ -1,20 +1,18 @@
1
- import sys
2
- sys.path.append("/hhd3/gaoshang/Track-Anything/tracker")
3
  import PIL
4
  from tools.interact_tools import SamControler
5
  from tracker.base_tracker import BaseTracker
 
6
  import numpy as np
7
  import argparse
8
 
9
 
10
 
11
  class TrackingAnything():
12
- def __init__(self, sam_checkpoint, xmem_checkpoint, args):
13
  self.args = args
14
  self.samcontroler = SamControler(sam_checkpoint, args.sam_model_type, args.device)
15
  self.xmem = BaseTracker(xmem_checkpoint, device=args.device)
16
-
17
-
18
  # def inference_step(self, first_flag: bool, interact_flag: bool, image: np.ndarray,
19
  # same_image_flag: bool, points:np.ndarray, labels: np.ndarray, logits: np.ndarray=None, multimask=True):
20
  # if first_flag:
@@ -63,7 +61,7 @@ def parse_augment():
63
  parser.add_argument('--sam_model_type', type=str, default="vit_h")
64
  parser.add_argument('--port', type=int, default=6080, help="only useful when running gradio applications")
65
  parser.add_argument('--debug', action="store_true")
66
- parser.add_argument('--mask_save', default=True)
67
  args = parser.parse_args()
68
 
69
  if args.debug:
 
 
1
  import PIL
2
  from tools.interact_tools import SamControler
3
  from tracker.base_tracker import BaseTracker
4
+ from inpainter.base_inpainter import BaseInpainter
5
  import numpy as np
6
  import argparse
7
 
8
 
9
 
10
  class TrackingAnything():
11
+ def __init__(self, sam_checkpoint, xmem_checkpoint, e2fgvi_checkpoint, args):
12
  self.args = args
13
  self.samcontroler = SamControler(sam_checkpoint, args.sam_model_type, args.device)
14
  self.xmem = BaseTracker(xmem_checkpoint, device=args.device)
15
+ self.baseinpainter = BaseInpainter(e2fgvi_checkpoint, args.device)
 
16
  # def inference_step(self, first_flag: bool, interact_flag: bool, image: np.ndarray,
17
  # same_image_flag: bool, points:np.ndarray, labels: np.ndarray, logits: np.ndarray=None, multimask=True):
18
  # if first_flag:
61
  parser.add_argument('--sam_model_type', type=str, default="vit_h")
62
  parser.add_argument('--port', type=int, default=6080, help="only useful when running gradio applications")
63
  parser.add_argument('--debug', action="store_true")
64
+ parser.add_argument('--mask_save', default=False)
65
  args = parser.parse_args()
66
 
67
  if args.debug:
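The updated constructor above now wires SAM, XMem, and the E2FGVI-based inpainter together. A hedged usage sketch follows, assuming the three checkpoints have already been downloaded to local paths; the E2FGVI filename below is a placeholder.

from track_anything import TrackingAnything, parse_augment

args = parse_augment()
model = TrackingAnything(
    sam_checkpoint="./checkpoints/sam_vit_h_4b8939.pth",   # SAM ViT-H weights
    xmem_checkpoint="./checkpoints/XMem-s012.pth",         # XMem weights
    e2fgvi_checkpoint="./checkpoints/E2FGVI-HQ.pth",       # placeholder filename
    args=args,
)
# model.samcontroler: click-based segmentation (SAM)
# model.xmem: mask propagation across frames (XMem)
# model.baseinpainter: video inpainting backend added in this commit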
tracker/.DS_Store CHANGED
Binary files a/tracker/.DS_Store and b/tracker/.DS_Store differ
tracker/base_tracker.py CHANGED
@@ -9,14 +9,14 @@ import yaml
9
  import torch.nn.functional as F
10
  from model.network import XMem
11
  from inference.inference_core import InferenceCore
12
- from util.mask_mapper import MaskMapper
13
  from torchvision import transforms
14
- from util.range_transform import im_normalization
15
- import sys
16
- sys.path.insert(0, sys.path[0]+"/../")
17
  from tools.painter import mask_painter
18
  from tools.base_segmenter import BaseSegmenter
19
  from torchvision.transforms import Resize
 
20
 
21
 
22
  class BaseTracker:
@@ -101,6 +101,8 @@ class BaseTracker:
101
  continue
102
  painted_image = mask_painter(painted_image, (final_mask==obj).astype('uint8'), mask_color=obj+1)
103
 
 
 
104
  return final_mask, final_mask, painted_image
105
 
106
  @torch.no_grad()
@@ -126,50 +128,65 @@ class BaseTracker:
126
  self.mapper.clear_labels()
127
 
128
 
 
 
 
 
 
 
 
 
 
129
  if __name__ == '__main__':
130
- # video frames (multiple objects)
131
  video_path_list = glob.glob(os.path.join('/ssd1/gaomingqi/datasets/davis/JPEGImages/480p/horsejump-high', '*.jpg'))
132
  video_path_list.sort()
133
- # first frame
134
- first_frame_path = '/ssd1/gaomingqi/datasets/davis/Annotations/480p/horsejump-high/00000.png'
135
  # load frames
136
  frames = []
137
  for video_path in video_path_list:
138
  frames.append(np.array(Image.open(video_path).convert('RGB')))
139
- frames = np.stack(frames, 0) # N, H, W, C
140
  # load first frame annotation
 
141
  first_frame_annotation = np.array(Image.open(first_frame_path).convert('P')) # H, W, C
142
 
143
- # ----------------------------------------------------------
144
- # initalise tracker
145
- # ----------------------------------------------------------
146
- device = 'cuda:4'
 
147
  XMEM_checkpoint = '/ssd1/gaomingqi/checkpoints/XMem-s012.pth'
148
- SAM_checkpoint= '/ssd1/gaomingqi/checkpoints/sam_vit_h_4b8939.pth'
149
- model_type = 'vit_h'
150
-
151
- # sam_model = BaseSegmenter(SAM_checkpoint, model_type, device=device)
152
  tracker = BaseTracker(XMEM_checkpoint, device, None, device)
153
-
154
- # # test for storage efficiency
155
- # frames = np.load('/ssd1/gaomingqi/efficiency/efficiency.npy')
156
- # first_frame_annotation = np.array(Image.open('/ssd1/gaomingqi/efficiency/template_mask.png'))
157
-
158
- first_frame_annotation[first_frame_annotation==1] = 15
159
- first_frame_annotation[first_frame_annotation==2] = 20
160
-
161
- save_path = '/ssd1/gaomingqi/results/TrackA/multi-change1'
162
- if not os.path.exists(save_path):
163
- os.mkdir(save_path)
164
-
165
  for ti, frame in enumerate(frames):
166
  if ti == 0:
167
- mask, prob, painted_image = tracker.track(frame, first_frame_annotation)
 
168
  else:
169
- mask, prob, painted_image = tracker.track(frame)
170
- # save
171
- painted_image = Image.fromarray(painted_image)
172
- painted_image.save(f'{save_path}/{ti:05d}.png')
 
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
  # tracker.clear_memory()
175
  # for ti, frame in enumerate(frames):
@@ -241,6 +258,3 @@ if __name__ == '__main__':
241
  # prob = Image.fromarray((probs[1].cpu().numpy()*255).astype('uint8'))
242
 
243
  # # prob.save(f'/ssd1/gaomingqi/failure/probs/{ti:05d}.png')
244
-
245
-
246
-
9
  import torch.nn.functional as F
10
  from model.network import XMem
11
  from inference.inference_core import InferenceCore
12
+ from tracker.util.mask_mapper import MaskMapper
13
  from torchvision import transforms
14
+ from tracker.util.range_transform import im_normalization
15
+
 
16
  from tools.painter import mask_painter
17
  from tools.base_segmenter import BaseSegmenter
18
  from torchvision.transforms import Resize
19
+ import progressbar
20
 
21
 
22
  class BaseTracker:
101
  continue
102
  painted_image = mask_painter(painted_image, (final_mask==obj).astype('uint8'), mask_color=obj+1)
103
 
104
+ # print(f'max memory allocated: {torch.cuda.max_memory_allocated()/(2**20)} MB')
105
+
106
  return final_mask, final_mask, painted_image
107
 
108
  @torch.no_grad()
128
  self.mapper.clear_labels()
129
 
130
 
131
+ ## how to use:
132
+ ## 1/3) prepare device and xmem_checkpoint
133
+ # device = 'cuda:2'
134
+ # XMEM_checkpoint = '/ssd1/gaomingqi/checkpoints/XMem-s012.pth'
135
+ ## 2/3) initialise Base Tracker
136
+ # tracker = BaseTracker(XMEM_checkpoint, device, None, device) # leave an interface for sam model (currently set None)
137
+ ## 3/3) track: call tracker.track(frame, first_frame_annotation) on the first frame, then tracker.track(frame) on the following frames
138
+
139
+
140
  if __name__ == '__main__':
141
+ # video frames (take videos from DAVIS-2017 as examples)
142
  video_path_list = glob.glob(os.path.join('/ssd1/gaomingqi/datasets/davis/JPEGImages/480p/horsejump-high', '*.jpg'))
143
  video_path_list.sort()
 
 
144
  # load frames
145
  frames = []
146
  for video_path in video_path_list:
147
  frames.append(np.array(Image.open(video_path).convert('RGB')))
148
+ frames = np.stack(frames, 0) # T, H, W, C
149
  # load first frame annotation
150
+ first_frame_path = '/ssd1/gaomingqi/datasets/davis/Annotations/480p/horsejump-high/00000.png'
151
  first_frame_annotation = np.array(Image.open(first_frame_path).convert('P')) # H, W, C
152
 
153
+ # ------------------------------------------------------------------------------------
154
+ # how to use
155
+ # ------------------------------------------------------------------------------------
156
+ # 1/4: set checkpoint and device
157
+ device = 'cuda:2'
158
  XMEM_checkpoint = '/ssd1/gaomingqi/checkpoints/XMem-s012.pth'
159
+ # SAM_checkpoint= '/ssd1/gaomingqi/checkpoints/sam_vit_h_4b8939.pth'
160
+ # model_type = 'vit_h'
161
+ # ------------------------------------------------------------------------------------
162
+ # 2/4: initialise the tracker
163
  tracker = BaseTracker(XMEM_checkpoint, device, None, device)
164
+ # ------------------------------------------------------------------------------------
165
+ # 3/4: for each frame, get tracking results by tracker.track(frame, first_frame_annotation)
166
+ # frame: numpy array (H, W, C); first_frame_annotation: numpy array (H, W), only needed for the first frame and omitted afterwards
167
+ painted_frames = []
 
 
 
 
 
 
 
 
168
  for ti, frame in enumerate(frames):
169
  if ti == 0:
170
+ mask, prob, painted_frame = tracker.track(frame, first_frame_annotation)
171
+ # mask: predicted mask (H, W); painted_frame: the frame with the mask painted on it
172
  else:
173
+ mask, prob, painted_frame = tracker.track(frame)
174
+ painted_frames.append(painted_frame)
175
+ # ----------------------------------------------
176
+ # 4/4: clear memory in XMEM for the next video
177
+ tracker.clear_memory()
178
+ # ----------------------------------------------
179
+ # end
180
+ # ----------------------------------------------
181
+ print(f'max memory allocated: {torch.cuda.max_memory_allocated()/(2**20)} MB')
182
+ # set saving path
183
+ save_path = '/ssd1/gaomingqi/results/TAM/blackswan'
184
+ if not os.path.exists(save_path):
185
+ os.mkdir(save_path)
186
+ # save
187
+ for ti, painted_frame in enumerate(progressbar.progressbar(painted_frames)):  # enumerate so each saved frame gets its own index
188
+ painted_frame = Image.fromarray(painted_frame)
189
+ painted_frame.save(f'{save_path}/{ti:05d}.png')
190
 
191
  # tracker.clear_memory()
192
  # for ti, frame in enumerate(frames):
258
  # prob = Image.fromarray((probs[1].cpu().numpy()*255).astype('uint8'))
259
 
260
  # # prob.save(f'/ssd1/gaomingqi/failure/probs/{ti:05d}.png')
 
 
 
tracker/inference/inference_core.py CHANGED
@@ -2,7 +2,7 @@ from inference.memory_manager import MemoryManager
2
  from model.network import XMem
3
  from model.aggregate import aggregate
4
 
5
- from util.tensor_util import pad_divide_by, unpad
6
 
7
 
8
  class InferenceCore:
2
  from model.network import XMem
3
  from model.aggregate import aggregate
4
 
5
+ from tracker.util.tensor_util import pad_divide_by, unpad
6
 
7
 
8
  class InferenceCore: