JohanDL committed on
Commit
f1dd031
1 Parent(s): 1b5a72d

initial commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. DOCKERFILE +29 -0
  2. README.md +5 -8
  3. app.py +71 -0
  4. configs/datasets/bdd/bdd_dataset.py +44 -0
  5. configs/datasets/tao/tao_dataset_v05.py +43 -0
  6. configs/datasets/tao/tao_dataset_v1.py +44 -0
  7. configs/default_runtime.py +23 -0
  8. configs/masa-detic/bdd_test/masa_detic_bdd_mot_test.py +224 -0
  9. configs/masa-detic/bdd_test/masa_detic_bdd_mots_test.py +227 -0
  10. configs/masa-detic/open_vocabulary_mot_test/masa_detic_swinb_open_vocabulary_test.py +236 -0
  11. configs/masa-detic/tao_teta_test/masa_detic_swinb_tao_test_detic_dets.py +219 -0
  12. configs/masa-detic/tao_teta_test/masa_detic_swinb_tao_test_teter_swinT_dets.py +219 -0
  13. configs/masa-gdino/bdd_test/masa_gdino_bdd_mot_test.py +226 -0
  14. configs/masa-gdino/bdd_test/masa_gdino_bdd_mots_test.py +227 -0
  15. configs/masa-gdino/masa_gdino_swinb_inference.py +216 -0
  16. configs/masa-gdino/masa_gdino_swinb_plug_and_play.py +218 -0
  17. configs/masa-gdino/open_vocabulary_mot_test/masa_gdino_swinb_open_vocabulary_test.py +236 -0
  18. configs/masa-gdino/tao_teta_test/masa_gdino_swinb_tao_test_detic_dets.py +235 -0
  19. configs/masa-gdino/tao_teta_test/masa_gdino_swinb_tao_test_teter_swinT_dets.py +240 -0
  20. configs/masa-one/bdd_test/masa_r50_bdd_mot_test.py +235 -0
  21. configs/masa-one/bdd_test/masa_r50_bdd_mots_test.py +238 -0
  22. configs/masa-one/masa_r50_plug_and_play.py +214 -0
  23. configs/masa-one/open_vocabulary_mot_test/masa_r50_open_vocabulary_test.py +231 -0
  24. configs/masa-one/tao_teta_test/masa_r50_tao_test_detic_dets.py +230 -0
  25. configs/masa-one/tao_teta_test/masa_r50_tao_test_teter_swinT_dets.py +230 -0
  26. configs/masa-sam/bdd_test/masa_sam_vitb_bdd_mot_test.py +245 -0
  27. configs/masa-sam/bdd_test/masa_sam_vitb_bdd_mots_test.py +241 -0
  28. configs/masa-sam/bdd_test/masa_sam_vith_bdd_mot_test.py +246 -0
  29. configs/masa-sam/bdd_test/masa_sam_vith_bdd_mots_test.py +240 -0
  30. configs/masa-sam/open_vocabulary_mot_test/masa_sam_vitb_open_vocabulary_test.py +233 -0
  31. configs/masa-sam/open_vocabulary_mot_test/masa_sam_vith_open_vocabulary_test.py +234 -0
  32. configs/masa-sam/sam-vitb.py +30 -0
  33. configs/masa-sam/sam-vith.py +30 -0
  34. configs/masa-sam/tao_teta_test/masa_sam_vitb_tao_test_detic_dets.py +232 -0
  35. configs/masa-sam/tao_teta_test/masa_sam_vitb_tao_test_teter_swinT_dets.py +238 -0
  36. configs/masa-sam/tao_teta_test/masa_sam_vith_tao_test_detic_dets.py +233 -0
  37. configs/masa-sam/tao_teta_test/masa_sam_vith_tao_test_teter_swinT_dets.py +239 -0
  38. environment_docker.yml +302 -0
  39. masa/__init__.py +3 -0
  40. masa/__pycache__/__init__.cpython-311.pyc +0 -0
  41. masa/apis/__init__.py +10 -0
  42. masa/apis/__pycache__/__init__.cpython-311.pyc +0 -0
  43. masa/apis/__pycache__/masa_inference.cpython-311.pyc +0 -0
  44. masa/apis/masa_inference.py +297 -0
  45. masa/datasets/__init__.py +19 -0
  46. masa/datasets/__pycache__/__init__.cpython-311.pyc +0 -0
  47. masa/datasets/__pycache__/bdd_masa_dataset.cpython-311.pyc +0 -0
  48. masa/datasets/__pycache__/dataset_wrappers.cpython-311.pyc +0 -0
  49. masa/datasets/__pycache__/masa_dataset.cpython-311.pyc +0 -0
  50. masa/datasets/__pycache__/rsconcat_dataset.cpython-311.pyc +0 -0
DOCKERFILE ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM continuumio/anaconda3:main

WORKDIR /code
COPY ./environment_docker.yml /code/environment_docker.yml

# Create the environment using the environment.yml file
RUN conda env create -f /code/environment_docker.yml

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user
# Switch to the "user" user
USER user

# Set home to the user's home directory.
# HOME gets its own ENV instruction on purpose: variable substitution inside a
# single ENV instruction uses the values from *before* that instruction, so
# `PYTHONPATH=$HOME/app` written in the same instruction as `HOME=...` would
# not expand to /home/user/app.
ENV HOME=/home/user
ENV PYTHONPATH=$HOME/app \
    PYTHONUNBUFFERED=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_THEME=huggingface \
    SYSTEM=spaces

# Set the working directory to the app directory inside the user's home
WORKDIR $HOME/app

# Copy the current directory contents into the container at $HOME/app setting the owner to the user
COPY --chown=user . $HOME/app

# Entry point script; it must be part of the copied directory contents.
CMD ["./run.sh"]
README.md CHANGED
@@ -1,11 +1,8 @@
1
  ---
2
- title: MASA GroundingDINO
3
- emoji: 🌍
4
- colorFrom: red
5
- colorTo: pink
6
  sdk: docker
7
- pinned: false
8
- license: mit
9
  ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: MASA + GroundingDINO Space
3
+ emoji: 🐳
4
+ colorFrom: purple
5
+ colorTo: gray
6
  sdk: docker
7
+ app_port: 7860
 
8
  ---
 
 
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import tempfile
4
+ import subprocess
5
+
6
+ # Define the function to call the command line script
7
def process_video(uploaded_video_path, texts):
    """Run the MASA + GroundingDINO demo script on a video and re-encode the result.

    The demo script ``demo/video_demo_with_text.py`` is executed as a child
    process and writes its output to a temporary ``.mp4`` file. That file is
    then re-encoded with ``ffmpeg`` into a sibling ``*_converted.mp4`` file,
    whose path is returned.

    Args:
        uploaded_video_path: Path of the input video, passed to the demo
            script as its positional argument.
        texts: Value forwarded to the demo script's ``--texts`` option.

    Returns:
        Path of the re-encoded video. The file is not deleted here.

    Raises:
        subprocess.CalledProcessError: If the demo script or ``ffmpeg`` exits
            with a non-zero status.
    """
    # Reserve a unique output path. The handle is closed right away so the
    # child processes can write to the file by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmpfile:
        output_video_path = tmpfile.name

    # Derive the converted path from the split extension so that only the
    # final suffix is touched, even if ".mp4" appears elsewhere in the path.
    stem, extension = os.path.splitext(output_video_path)
    converted_output_path = f"{stem}_converted{extension}"

    command = [
        "python", "demo/video_demo_with_text.py", uploaded_video_path,
        "--out", output_video_path,
        "--masa_config", "configs/masa-gdino/masa_gdino_swinb_inference.py",
        "--masa_checkpoint", "saved_models/masa_models/gdino_masa.pth",
        "--texts", texts,
        "--score-thr", "0.2",
        "--unified",
        "--show_fps"
    ]

    # Ensure the video is in a compatible format using ffmpeg
    ffmpeg_command = [
        "ffmpeg", "-i", output_video_path, "-c:v", "mpeg4",
        "-c:a", "aac", "-b:a", "128k", "-movflags", "+faststart", converted_output_path
    ]

    try:
        subprocess.run(command, check=True)
        subprocess.run(ffmpeg_command, check=True)
    finally:
        # The raw tracker output is only an intermediate artifact; remove it on
        # success and on failure so repeated requests do not fill the temp dir.
        try:
            os.remove(output_video_path)
        except FileNotFoundError:
            pass

    return converted_output_path
33
+
34
# Custom CSS handed to gr.Blocks below; it caps the height of elements with
# the listed ids.
css = """
#img-display-container {
max-height: 100vh;
}
#img-display-input {
max-height: 80vh;
}
#img-display-output {
max-height: 80vh;
}
"""

# Markdown snippets rendered at the top of the page.
title = "# MASA Track Everything Demo"
description = """ MASA + GroundingDINO on your video files!
Please refer to our [paper](https://arxiv.org/abs/2406.04221), [project page](https://matchinganything.github.io/), or [github](https://github.com/siyuanliii/masa/tree/main?tab=readme-ov-file) for more details."""

with gr.Blocks(css=css) as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown("### Video Object Tracking demo")

    # Inputs: the video to process and the text forwarded to process_video.
    with gr.Row():
        input_video = gr.Video(label="Input Video")
        input_texts = gr.Textbox(label="Input Texts")

    submit = gr.Button("Submit")
    processed_video = gr.Video(label="Processed Video")

    # Clicking the button runs process_video on the two inputs and shows the
    # returned file in the output player.
    submit.click(process_video, inputs=[input_video, input_texts], outputs=processed_video)

    # Offer every file in assets/examples_video, in sorted order, as an example.
    # NOTE(review): each example provides only a video path although two input
    # components are listed — confirm Gradio handles the missing text value,
    # especially with cache_examples=True.
    example_files = [
        os.path.join('assets/examples_video', filename)
        for filename in sorted(os.listdir('assets/examples_video'))
    ]
    examples = gr.Examples(
        examples=example_files,
        inputs=[input_video, input_texts],
        outputs=processed_video,
        fn=process_video,
        cache_examples=True,
    )

if __name__ == '__main__':
    demo.queue().launch()
71
+
configs/datasets/bdd/bdd_dataset.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# dataset settings
# Evaluation-only dataset config for BDD box tracking: it defines the test
# pipeline, the val/test dataloader and the TETA evaluator. No training
# dataloader is declared in this file.

# Normalization constants; not referenced again within this file.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)


# NOTE: the spelling `tpye` is part of the variable name used below; it is
# kept unchanged because top-level names in a config file are config keys.
test_dataset_tpye = 'BDDVideoDataset'

# Per-sample pipeline: load the image, resize with preserved aspect ratio,
# load the track annotations, then pack everything for the model.
test_pipeline = [
    dict(
        type='TransformBroadcaster',
        transforms=[
            dict(type='LoadImageFromFile'),
            dict(type='Resize', scale=(1333, 800), keep_ratio=True),
            dict(type='LoadTrackAnnotations')
        ]),
    dict(type='PackTrackInputs')
]

val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    sampler=dict(type='TrackImgSampler'),
    dataset=dict(
        type=test_dataset_tpye,
        ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
        data_prefix=dict(img_path='data/bdd/bdd100k/images/track/val/'),
        test_mode=True,
        pipeline=test_pipeline
    ))

# Testing reuses the validation dataloader object.
test_dataloader = val_dataloader

# evaluator
# Uses the same annotation file as the dataloader above.
val_evaluator = dict(
    type='BDDTETAMetric',
    dataset_type=test_dataset_tpye,
    format_only=False,
    ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
    scalabel_gt='data/bdd/annotations/scalabel_gt/box_track_20/val/',
    metric=['TETA'])
# Testing reuses the validation evaluator object.
test_evaluator = val_evaluator
43
+
44
+
configs/datasets/tao/tao_dataset_v05.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# data pipeline
# Evaluation-only dataset config for TAO with the LVIS v0.5 class annotations:
# test pipeline, val/test dataloader and TETA evaluator.

# Per-sample pipeline: load the image, resize with preserved aspect ratio,
# load the track annotations, then pack everything for the model.
test_pipeline = [
    dict(
        type='TransformBroadcaster',
        transforms=[
            dict(type='LoadImageFromFile'),
            dict(type='Resize', scale=(1333, 800), keep_ratio=True),
            dict(type='LoadTrackAnnotations')
        ]),
    dict(type='PackTrackInputs')
]

# dataloader

# NOTE: the spelling `tpye` is part of the variable name used below; it is
# kept unchanged because top-level names in a config file are config keys.
test_dataset_tpye = 'Taov05Dataset'

val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    # Two ways of sampling at test time are mentioned by the authors:
    # image-based and video-based. The sampler configured here is the
    # image-based one; a video-based alternative is not shown in this file.
    sampler=dict(type='TrackImgSampler'),  # image-based sampling
    dataset=dict(
        type=test_dataset_tpye,
        ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
        data_prefix=dict(img_path='data/tao/frames/'),
        test_mode=True,
        pipeline=test_pipeline
    ))
# Testing reuses the validation dataloader object.
test_dataloader = val_dataloader

# evaluator
# Uses the same annotation file as the dataloader above.
val_evaluator = dict(
    type='TaoTETAMetric',
    dataset_type=test_dataset_tpye,
    format_only=False,
    ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
    metric=['TETA'])
# Testing reuses the validation evaluator object.
test_evaluator = val_evaluator
42
+
43
+
configs/datasets/tao/tao_dataset_v1.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# data pipeline
# Evaluation-only dataset config for TAO with the LVIS v1 class annotations:
# test pipeline, val/test dataloader and TETA evaluator.

# Per-sample pipeline: load the image, resize with preserved aspect ratio,
# load the track annotations, then pack everything for the model.
test_pipeline = [
    dict(
        type='TransformBroadcaster',
        transforms=[
            dict(type='LoadImageFromFile'),
            dict(type='Resize', scale=(1333, 800), keep_ratio=True),
            dict(type='LoadTrackAnnotations')
        ]),
    dict(type='PackTrackInputs')
]

# dataloader

# NOTE: the spelling `tpye` is part of the variable name used below; it is
# kept unchanged because top-level names in a config file are config keys.
test_dataset_tpye = 'Taov1Dataset'

val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    # Two ways of sampling at test time are mentioned by the authors:
    # image-based and video-based. The sampler configured here is the
    # image-based one; a video-based alternative is not shown in this file.
    sampler=dict(type='TrackImgSampler'),  # image-based sampling
    dataset=dict(
        type=test_dataset_tpye,
        ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
        data_prefix=dict(img_path='data/tao/frames/'),
        test_mode=True,
        pipeline=test_pipeline
    ))

# Testing reuses the validation dataloader object.
test_dataloader = val_dataloader

# evaluator
# Uses the same annotation file as the dataloader above.
val_evaluator = dict(
    type='TaoTETAMetric',
    dataset_type=test_dataset_tpye,
    format_only=False,
    ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
    metric=['TETA'])
# Testing reuses the validation evaluator object.
test_evaluator = val_evaluator
43
+
44
+
configs/default_runtime.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Shared runtime defaults: registry scope, hooks, environment, visualizer and
# logging settings. Other configs list this file in their `_base_`.
default_scope = 'mmdet'
# Hooks registered by default; logging every 50 iterations and checkpointing
# at interval 1.
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', interval=1),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='DetVisualizationHook'))

# Process/environment settings: cuDNN benchmark off, 'fork' start method for
# multiprocessing, and the NCCL backend for distributed runs.
env_cfg = dict(
    cudnn_benchmark=False,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'),
)

vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='DetLocalVisualizer', vis_backends=vis_backends, name='visualizer')
log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)

log_level = 'INFO'
# No checkpoint is loaded and no run is resumed by default.
load_from = None
resume = False
configs/masa-detic/bdd_test/masa_detic_bdd_mot_test.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base configs inherited by this file: the Detic detector, the BDD dataset
# settings and the default runtime. The relative paths climb three levels for
# `projects/` and two levels to reach `configs/`.
_base_ = [
    '../../../projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-masa.py',
    '../../datasets/bdd/bdd_dataset.py',
    '../../default_runtime.py'
]
default_scope = 'mmdet'
# Reuse the inherited Detic model config as the detector of the MASA model.
detector = _base_.model
# Drop the detector's own data_preprocessor entry; the MASA model defined
# later in this file declares its own.
detector.pop('data_preprocessor')
# Initialize the detector from the pretrained Detic checkpoint.
# NOTE(review): the `# noqa: E501` marker sits on its own line, so it probably
# does not silence the long-line warning for the checkpoint line — confirm.
detector['init_cfg'] = dict(
    type='Pretrained',
    checkpoint= 'saved_models/tsa_models/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-ec91245d.pth'
    # noqa: E501
)
detector['type'] = 'DeticMasa'

# Remove the inherited `model` entry — presumably so the `model` defined
# below replaces it instead of being merged with it; confirm against the
# MMEngine config inheritance rules.
del _base_.model
17
+
18
# MASA model config for the BDD MOT test: combines the Detic `detector`
# prepared above with an adapter, RPN/RoI heads, a track head and a tracker.
model = dict(
    type='MASA',
    freeze_detector=True,
    unified_backbone=True,
    # Pre-computed detections are enabled and read from `public_det_path` —
    # presumably used in place of the detector's own outputs; confirm in the
    # MASA model implementation.
    load_public_dets = True,
    benchmark = 'bdd',
    public_det_path = 'results/public_dets/bdd_mot_yolox_dets/',
    data_preprocessor=dict(
        type='TrackDataPreprocessor',
        # Image normalization parameters
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        # Image padding parameters
        pad_mask=True,  # In instance segmentation, the mask needs to be padded
        pad_size_divisor=32),  # Padding the image to multiples of 32

    detector=detector,
    # Adapter stack: an FPN over three input levels followed by DeformFusion.
    masa_adapter=[
        dict(
            type='FPN',
            in_channels=[256, 512, 1024],
            out_channels=256,
            norm_cfg=dict(type='SyncBN', requires_grad=True),
            num_outs=5),
        dict(
            type='DeformFusion',
            in_channels=256,
            out_channels=256,
            num_blocks=3)],
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[8, 16, 32, 64, 128]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
    ),
    # Single-class box head with class-agnostic regression.
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=1,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=True,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
    # model training and testing settings
    # NOTE: training settings are still declared here although the runtime
    # section of this file sets the top-level train_cfg/train_dataloader to None.
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.02,
            nms=dict(type='nms',
                     iou_threshold=0.5,
                     class_agnostic=True,
                     split_thr=100000),
            max_per_img=50,
            mask_thr_binary=0.5)
        # soft-nms is also supported for rcnn testing
        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
    ),
    # Track head: RoI feature extraction followed by an embedding head.
    track_head=dict(
        type='MasaTrackHead',
        roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[8, 16, 32]),
        embed_head=dict(
            type='QuasiDenseEmbedHead',
            num_convs=4,
            num_fcs=1,
            embed_channels=256,
            norm_cfg=dict(type='GN', num_groups=32),
            loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
            loss_track_aux=dict(
                type='MarginL2Loss',
                neg_pos_ub=3,
                pos_margin=0,
                neg_margin=0.1,
                hard_mining=True,
                loss_weight=1.0)),
        # loss_bbox=dict(type='L1Loss', loss_weight=1.0),
        train_cfg=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='CombinedSampler',
                num=512,
                pos_fraction=0.5,
                neg_pos_ub=3,
                add_gt_as_proposals=True,
                pos_sampler=dict(type='InstanceBalancedPosSampler'),
                neg_sampler=dict(type='RandomSampler')))),
    # BDD-specific tracker; matching uses the 'bisoftmax' metric and category
    # information is disabled (`with_cats=False`).
    tracker=dict(
        type='MasaBDDTracker',
        init_score_thr=0.5,
        obj_score_thr=0.3,
        match_score_thr=0.6,
        memo_tracklet_frames=10,
        memo_backdrop_frames=1,
        memo_momentum=0.8,
        nms_conf_thr=0.5,
        nms_backdrop_iou_thr=0.3,
        nms_class_iou_thr=0.7,
        with_cats=False,
        match_metric='bisoftmax')
)
195
+
196
# runtime settings
# Evaluation-only setup: no training dataloader or training loop is configured.
train_dataloader = None
train_cfg = None
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

# Hook overrides; the visualization hook is the tracking variant with drawing
# switched off.
default_hooks = dict(
    logger=dict(type='LoggerHook', interval=50),
    visualization=dict(type='TrackVisualizationHook', draw=False),
    checkpoint=dict(type='CheckpointHook', interval=1),
)

vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')

# Only `ann_file` is given here; the remaining dataloader settings are
# presumably taken from the inherited dataset config — confirm against the
# MMEngine merge rules.
val_dataloader = dict(
    dataset=dict(
        ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
    )
)
# Testing reuses the validation dataloader object.
test_dataloader = val_dataloader
# Evaluator overrides: ground-truth locations, output prefix, and the list of
# metrics (TETA, HOTA and CLEAR).
val_evaluator = dict(
    ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
    scalabel_gt='data/bdd/annotations/scalabel_gt/box_track_20/val/',
    outfile_prefix='results/detic_masa_trained_bdd_demo',
    metric=['TETA', 'HOTA', 'CLEAR']
)
# Testing reuses the validation evaluator object.
test_evaluator = val_evaluator
configs/masa-detic/bdd_test/masa_detic_bdd_mots_test.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base configs inherited by this file: the Detic detector, the BDD dataset
# settings and the default runtime. Base paths are resolved relative to this
# file's directory (configs/masa-detic/bdd_test/), so reaching `projects/`
# takes three levels up and reaching `configs/` takes two.
_base_ = [
    '../../../projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-masa.py',
    '../../datasets/bdd/bdd_dataset.py',
    '../../default_runtime.py'
]
default_scope = 'mmdet'
# Reuse the inherited Detic model config as the detector of the MASA model.
detector = _base_.model
# Drop the detector's own data_preprocessor entry; the MASA model defined
# later in this file declares its own.
detector.pop('data_preprocessor')
# Initialize the detector from the pretrained Detic checkpoint.
detector['init_cfg'] = dict(
    type='Pretrained',
    checkpoint= 'saved_models/tsa_models/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-ec91245d.pth'
    # noqa: E501
)
detector['type'] = 'DeticMasa'

# Remove the inherited `model` entry — presumably so the `model` defined
# below replaces it instead of being merged with it; confirm against the
# MMEngine config inheritance rules.
del _base_.model
17
+
18
# MASA model config for the BDD MOTS test: combines the Detic `detector`
# prepared above with an adapter, RPN/RoI heads, a track head and a tracker.
# Segmentation output is switched on through `with_segm=True`.
model = dict(
    type='MASA',
    freeze_detector=True,
    unified_backbone=True,
    # Pre-computed detections are enabled and read from `public_det_path` —
    # presumably used in place of the detector's own outputs; confirm in the
    # MASA model implementation.
    load_public_dets = True,
    with_segm=True,
    benchmark = 'bdd',
    public_det_path = 'results/public_dets/bdd_mots_val_uninext_dets/',
    data_preprocessor=dict(
        type='TrackDataPreprocessor',
        # Image normalization parameters
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        # Image padding parameters
        pad_mask=True,  # In instance segmentation, the mask needs to be padded
        pad_size_divisor=32),  # Padding the image to multiples of 32

    detector=detector,
    # Adapter stack: an FPN over three input levels followed by DeformFusion.
    masa_adapter=[
        dict(
            type='FPN',
            in_channels=[256, 512, 1024],
            out_channels=256,
            norm_cfg=dict(type='SyncBN', requires_grad=True),
            num_outs=5),
        dict(
            type='DeformFusion',
            in_channels=256,
            out_channels=256,
            num_blocks=3)],
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[8, 16, 32, 64, 128]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
    ),
    # Single-class box head with class-agnostic regression.
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=1,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=True,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
    # model training and testing settings
    # NOTE: training settings are still declared here although the runtime
    # section of this file sets the top-level train_cfg/train_dataloader to None.
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.02,
            # nms=dict(type='nms', iou_threshold=0.5),
            nms=dict(type='nms',
                     iou_threshold=0.5,
                     class_agnostic=True,
                     split_thr=100000),
            max_per_img=50,
            mask_thr_binary=0.5)
        # soft-nms is also supported for rcnn testing
        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
    ),
    # Track head: RoI feature extraction followed by an embedding head.
    track_head=dict(
        type='MasaTrackHead',
        roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[8, 16, 32]),
        embed_head=dict(
            type='QuasiDenseEmbedHead',
            num_convs=4,
            num_fcs=1,
            embed_channels=256,
            norm_cfg=dict(type='GN', num_groups=32),
            loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
            loss_track_aux=dict(
                type='MarginL2Loss',
                neg_pos_ub=3,
                pos_margin=0,
                neg_margin=0.1,
                hard_mining=True,
                loss_weight=1.0)),
        # loss_bbox=dict(type='L1Loss', loss_weight=1.0),
        train_cfg=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='CombinedSampler',
                num=512,
                pos_fraction=0.5,
                neg_pos_ub=3,
                add_gt_as_proposals=True,
                pos_sampler=dict(type='InstanceBalancedPosSampler'),
                neg_sampler=dict(type='RandomSampler')))),
    # BDD-specific tracker; matching uses the 'bisoftmax' metric and category
    # information is disabled (`with_cats=False`).
    tracker=dict(
        type='MasaBDDTracker',
        init_score_thr=0.5,
        obj_score_thr=0.3,
        match_score_thr=0.6,
        memo_tracklet_frames=10,
        memo_backdrop_frames=1,
        memo_momentum=0.8,
        nms_conf_thr=0.5,
        nms_backdrop_iou_thr=0.3,
        nms_class_iou_thr=0.7,
        with_cats=False,
        match_metric='bisoftmax')
)
197
+
198
# runtime settings
# Evaluation-only setup: no training dataloader or training loop is configured.
train_dataloader = None
train_cfg = None
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

# Hook overrides; the visualization hook is the tracking variant with drawing
# switched off.
default_hooks = dict(
    logger=dict(type='LoggerHook', interval=50),
    visualization=dict(type='TrackVisualizationHook', draw=False),
    checkpoint = dict(type='CheckpointHook', interval=1),
)

vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')

# Point the dataset at the segmentation-tracking annotations; the remaining
# dataloader settings are presumably taken from the inherited dataset config —
# confirm against the MMEngine merge rules.
val_dataloader = dict(
    dataset=dict(
        ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
    )
)
# Testing reuses the validation dataloader object.
test_dataloader = val_dataloader
# Evaluator overrides for mask evaluation (`with_mask=True`) using the
# seg_track ground truth.
# NOTE(review): the output prefix mentions "groundingdino" although this is a
# Detic config — looks like a leftover from a copied config; confirm the
# intended results directory.
val_evaluator = dict(
    ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
    scalabel_gt='data/bdd/annotations/scalabel_gt/seg_track_20/val/',
    outfile_prefix='results/masa_results/masa-groundingdino-release-bdd-mots-test',
    metric=['TETA', 'HOTA', 'CLEAR'],
    with_mask=True,
)
# Testing reuses the validation evaluator object.
test_evaluator = val_evaluator
configs/masa-detic/open_vocabulary_mot_test/masa_detic_swinb_open_vocabulary_test.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base configs inherited by this file: the Detic detector, the TAO (LVIS v1
# classes) dataset settings and the default runtime.
_base_ = [
    '../../../projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-masa.py',
    '../../datasets/tao/tao_dataset_v1.py',
    '../../default_runtime.py'
]
default_scope = 'mmdet'
# Reuse the inherited Detic model config as the detector of the MASA model.
detector = _base_.model
# Drop the detector's own data_preprocessor entry; the MASA model defined
# later in this file declares its own.
detector.pop('data_preprocessor')
# Initialize the detector from the pretrained Detic checkpoint.
detector['init_cfg'] = dict(
    type='Pretrained',
    checkpoint= 'saved_models/tsa_models/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-ec91245d.pth'
    # noqa: E501
)
detector['type'] = 'DeticMasa'
# Detector-level test settings: proposal filtering for the RPN stage and
# class-agnostic NMS with at most 50 results per image for the RCNN stage.
detector['test_cfg'] =dict(
    rpn=dict(
        score_thr=0.0001,
        nms_pre=1000,
        max_per_img=256,
        nms=dict(type='nms', iou_threshold=0.9),
        min_bbox_size=0),
    rcnn=dict(
        score_thr=0.02,
        nms=dict(type='nms',
                 iou_threshold=0.5,
                 class_agnostic=True,
                 split_thr=100000),
        max_per_img=50,
        mask_thr_binary=0.5)
)

# Remove the inherited `model` entry — presumably so the `model` defined
# below replaces it instead of being merged with it; confirm against the
# MMEngine config inheritance rules.
del _base_.model
33
+
34
# MASA model config for the open-vocabulary MOT test on TAO: combines the
# Detic `detector` prepared above with an adapter, RPN/RoI heads, a track head
# and a tracker.
model = dict(
    type='MASA',
    freeze_detector=True,
    unified_backbone=True,
    # Loading pre-computed detections is disabled here; `public_det_path` is
    # still set — presumably unused in this mode, confirm in the MASA model.
    load_public_dets = False,
    benchmark = 'tao',
    public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
    data_preprocessor=dict(
        type='TrackDataPreprocessor',
        # Image normalization parameters
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        bgr_to_rgb=True,
        # Image padding parameters
        pad_mask=True,  # In instance segmentation, the mask needs to be padded
        pad_size_divisor=32),  # Padding the image to multiples of 32
    detector=detector,
    # Adapter stack: an FPN over three input levels followed by DeformFusion.
    masa_adapter=[
        dict(
            type='FPN',
            in_channels=[256, 512, 1024],
            out_channels=256,
            norm_cfg=dict(type='SyncBN', requires_grad=True),
            num_outs=5),
        dict(
            type='DeformFusion',
            in_channels=256,
            out_channels=256,
            num_blocks=3)],
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            scales=[8],
            ratios=[0.5, 1.0, 2.0],
            strides=[8, 16, 32, 64, 128]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
    ),
    # Single-class box head with class-agnostic regression.
    roi_head=dict(
        type='StandardRoIHead',
        bbox_roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[8, 16, 32]),
        bbox_head=dict(
            type='Shared2FCBBoxHead',
            in_channels=256,
            fc_out_channels=1024,
            roi_feat_size=7,
            num_classes=1,
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0., 0., 0., 0.],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            reg_class_agnostic=True,
            loss_cls=dict(
                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
    # model training and testing settings
    # NOTE: training settings are still declared here although the runtime
    # section of this file sets the top-level train_cfg/train_dataloader to None.
    train_cfg=dict(
        rpn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.3,
                min_pos_iou=0.3,
                match_low_quality=True,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=256,
                pos_fraction=0.5,
                neg_pos_ub=-1,
                add_gt_as_proposals=False),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        rpn_proposal=dict(
            nms_pre=2000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.5,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='RandomSampler',
                num=512,
                pos_fraction=0.25,
                neg_pos_ub=-1,
                add_gt_as_proposals=True),
            pos_weight=-1,
            debug=False)),
    test_cfg=dict(
        rpn=dict(
            nms_pre=1000,
            max_per_img=1000,
            nms=dict(type='nms', iou_threshold=0.7),
            min_bbox_size=0),
        rcnn=dict(
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.5),
            max_per_img=100)
        # soft-nms is also supported for rcnn testing
        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
    ),
    # Track head: RoI feature extraction followed by an embedding head.
    track_head=dict(
        type='MasaTrackHead',
        roi_extractor=dict(
            type='SingleRoIExtractor',
            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
            out_channels=256,
            featmap_strides=[8, 16, 32]),
        embed_head=dict(
            type='QuasiDenseEmbedHead',
            num_convs=4,
            num_fcs=1,
            embed_channels=256,
            norm_cfg=dict(type='GN', num_groups=32),
            loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
            loss_track_aux=dict(
                type='MarginL2Loss',
                neg_pos_ub=3,
                pos_margin=0,
                neg_margin=0.1,
                hard_mining=True,
                loss_weight=1.0)),
        # loss_bbox=dict(type='L1Loss', loss_weight=1.0),
        train_cfg=dict(
            assigner=dict(
                type='MaxIoUAssigner',
                pos_iou_thr=0.7,
                neg_iou_thr=0.5,
                min_pos_iou=0.5,
                match_low_quality=False,
                ignore_iof_thr=-1),
            sampler=dict(
                type='CombinedSampler',
                num=512,
                pos_fraction=0.8,
                neg_pos_ub=3,
                add_gt_as_proposals=True,
                pos_sampler=dict(type='InstanceBalancedPosSampler'),
                neg_sampler=dict(type='RandomSampler')))),
    # TAO-specific tracker with very low score thresholds; category
    # information is disabled (`with_cats=False`).
    tracker=dict(
        type='MasaTaoTracker',
        init_score_thr=0.0001,
        obj_score_thr=0.0001,
        match_score_thr=0.5,
        memo_tracklet_frames=10,
        memo_momentum=0.8,
        with_cats=False,
        max_distance=-1,
        fps=1,
    )
)
204
+
205
# Evaluation-only setup: no training dataloader or training loop is configured.
train_dataloader = None
train_cfg = None
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')

# Hook overrides; the visualization hook is the tracking variant with drawing
# switched off.
default_hooks = dict(
    logger=dict(type='LoggerHook', interval=50),
    visualization=dict(type='TrackVisualizationHook', draw=False))

vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')

# custom hooks
custom_hooks = [
    # Synchronize model buffers such as running_mean and running_var in BN
    # at the end of each epoch
    dict(type='SyncBuffersHook')
]
# Automatic learning-rate scaling is disabled.
auto_scale_lr = dict(enable=False, base_batch_size=16)
# Only `ann_file` is given here; the remaining dataloader settings are
# presumably taken from the inherited dataset config — confirm against the
# MMEngine merge rules.
val_dataloader = dict(
    dataset=dict(
        ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
    )
)
# Testing reuses the validation dataloader object.
test_dataloader = val_dataloader
# Evaluator overrides: annotation file, output prefix and the
# open-vocabulary flag.
val_evaluator = dict(
    ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
    outfile_prefix='results/masa_results/masa-detic-release-ovmot-test',
    open_vocabulary=True,
)
# Testing reuses the validation evaluator object.
test_evaluator = val_evaluator
configs/masa-detic/tao_teta_test/masa_detic_swinb_tao_test_detic_dets.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../../../projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-masa.py',
3
+ '../../datasets/tao/tao_dataset_v1.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ detector.pop('data_preprocessor')
9
+ detector['init_cfg'] = dict(
10
+ type='Pretrained',
11
+ checkpoint= 'saved_models/tsa_models/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-ec91245d.pth'
12
+ # noqa: E501
13
+ )
14
+ detector['type'] = 'DeticMasa'
15
+
16
+ del _base_.model
17
+
18
+ model = dict(
19
+ type='MASA',
20
+ freeze_detector=True,
21
+ unified_backbone=True,
22
+ load_public_dets = True,
23
+ benchmark = 'tao',
24
+ public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
25
+ data_preprocessor=dict(
26
+ type='TrackDataPreprocessor',
27
+ # Image normalization parameters
28
+ mean=[123.675, 116.28, 103.53],
29
+ std=[58.395, 57.12, 57.375],
30
+ bgr_to_rgb=True,
31
+ # Image padding parameters
32
+ pad_mask=True, # In instance segmentation, the mask needs to be padded
33
+ pad_size_divisor=32), # Padding the image to multiples of 32
34
+ detector=detector,
35
+ masa_adapter=[
36
+ dict(
37
+ type='FPN',
38
+ in_channels=[256, 512, 1024],
39
+ out_channels=256,
40
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
41
+ num_outs=5),
42
+ dict(
43
+ type='DeformFusion',
44
+ in_channels=256,
45
+ out_channels=256,
46
+ num_blocks=3)],
47
+ rpn_head=dict(
48
+ type='RPNHead',
49
+ in_channels=256,
50
+ feat_channels=256,
51
+ anchor_generator=dict(
52
+ type='AnchorGenerator',
53
+ scales=[8],
54
+ ratios=[0.5, 1.0, 2.0],
55
+ strides=[8, 16, 32, 64, 128]),
56
+ bbox_coder=dict(
57
+ type='DeltaXYWHBBoxCoder',
58
+ target_means=[.0, .0, .0, .0],
59
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
60
+ loss_cls=dict(
61
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
62
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
63
+ ),
64
+ roi_head=dict(
65
+ type='StandardRoIHead',
66
+ bbox_roi_extractor=dict(
67
+ type='SingleRoIExtractor',
68
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
69
+ out_channels=256,
70
+ featmap_strides=[8, 16, 32]),
71
+ bbox_head=dict(
72
+ type='Shared2FCBBoxHead',
73
+ in_channels=256,
74
+ fc_out_channels=1024,
75
+ roi_feat_size=7,
76
+ num_classes=1,
77
+ bbox_coder=dict(
78
+ type='DeltaXYWHBBoxCoder',
79
+ target_means=[0., 0., 0., 0.],
80
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
81
+ reg_class_agnostic=True,
82
+ loss_cls=dict(
83
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
84
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
85
+ # model training and testing settings
86
+ train_cfg=dict(
87
+ rpn=dict(
88
+ assigner=dict(
89
+ type='MaxIoUAssigner',
90
+ pos_iou_thr=0.7,
91
+ neg_iou_thr=0.3,
92
+ min_pos_iou=0.3,
93
+ match_low_quality=True,
94
+ ignore_iof_thr=-1),
95
+ sampler=dict(
96
+ type='RandomSampler',
97
+ num=256,
98
+ pos_fraction=0.5,
99
+ neg_pos_ub=-1,
100
+ add_gt_as_proposals=False),
101
+ allowed_border=-1,
102
+ pos_weight=-1,
103
+ debug=False),
104
+ rpn_proposal=dict(
105
+ nms_pre=2000,
106
+ max_per_img=1000,
107
+ nms=dict(type='nms', iou_threshold=0.7),
108
+ min_bbox_size=0),
109
+ rcnn=dict(
110
+ assigner=dict(
111
+ type='MaxIoUAssigner',
112
+ pos_iou_thr=0.5,
113
+ neg_iou_thr=0.5,
114
+ min_pos_iou=0.5,
115
+ match_low_quality=False,
116
+ ignore_iof_thr=-1),
117
+ sampler=dict(
118
+ type='RandomSampler',
119
+ num=512,
120
+ pos_fraction=0.25,
121
+ neg_pos_ub=-1,
122
+ add_gt_as_proposals=True),
123
+ pos_weight=-1,
124
+ debug=False)),
125
+ test_cfg=dict(
126
+ rpn=dict(
127
+ nms_pre=1000,
128
+ max_per_img=1000,
129
+ nms=dict(type='nms', iou_threshold=0.7),
130
+ min_bbox_size=0),
131
+ rcnn=dict(
132
+ score_thr=0.05,
133
+ nms=dict(type='nms', iou_threshold=0.5),
134
+ max_per_img=100)
135
+ # soft-nms is also supported for rcnn testing
136
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
137
+ ),
138
+ track_head=dict(
139
+ type='MasaTrackHead',
140
+ roi_extractor=dict(
141
+ type='SingleRoIExtractor',
142
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
143
+ out_channels=256,
144
+ featmap_strides=[8, 16, 32]),
145
+ embed_head=dict(
146
+ type='QuasiDenseEmbedHead',
147
+ num_convs=4,
148
+ num_fcs=1,
149
+ embed_channels=256,
150
+ norm_cfg=dict(type='GN', num_groups=32),
151
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
152
+ loss_track_aux=dict(
153
+ type='MarginL2Loss',
154
+ neg_pos_ub=3,
155
+ pos_margin=0,
156
+ neg_margin=0.1,
157
+ hard_mining=True,
158
+ loss_weight=1.0)),
159
+ # loss_bbox=dict(type='L1Loss', loss_weight=1.0),
160
+ train_cfg=dict(
161
+ assigner=dict(
162
+ type='MaxIoUAssigner',
163
+ pos_iou_thr=0.7,
164
+ neg_iou_thr=0.5,
165
+ min_pos_iou=0.5,
166
+ match_low_quality=False,
167
+ ignore_iof_thr=-1),
168
+ sampler=dict(
169
+ type='CombinedSampler',
170
+ num=512,
171
+ pos_fraction=0.8,
172
+ neg_pos_ub=3,
173
+ add_gt_as_proposals=True,
174
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
175
+ neg_sampler=dict(type='RandomSampler')))),
176
+ tracker=dict(
177
+ type='MasaTaoTracker',
178
+ init_score_thr=0.0001,
179
+ obj_score_thr=0.0001,
180
+ match_score_thr=0.5,
181
+ memo_tracklet_frames=10,
182
+ memo_momentum=0.8,
183
+ with_cats=False,
184
+ max_distance=-1,
185
+ fps=1,
186
+ )
187
+ )
188
+
189
+ train_dataloader = None
190
+ train_cfg = None
191
+ val_cfg = dict(type='ValLoop')
192
+ test_cfg = dict(type='TestLoop')
193
+
194
+ default_hooks = dict(
195
+ logger=dict(type='LoggerHook', interval=50),
196
+ visualization=dict(type='TrackVisualizationHook', draw=False))
197
+
198
+ vis_backends = [dict(type='LocalVisBackend')]
199
+ visualizer = dict(
200
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
201
+
202
+ # custom hooks
203
+ custom_hooks = [
204
+ # Synchronize model buffers such as running_mean and running_var in BN
205
+ # at the end of each epoch
206
+ dict(type='SyncBuffersHook')
207
+ ]
208
+ auto_scale_lr = dict(enable=False, base_batch_size=16)
209
+ val_dataloader = dict(
210
+ dataset=dict(
211
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
212
+ )
213
+ )
214
+ test_dataloader = val_dataloader
215
+ val_evaluator = dict(
216
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
217
+ outfile_prefix='results/masa_results/masa-detic-release-detic-dets-tao-test',
218
+ )
219
+ test_evaluator = val_evaluator
configs/masa-detic/tao_teta_test/masa_detic_swinb_tao_test_teter_swinT_dets.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../../../projects/Detic_new/configs/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-masa.py',
3
+ '../../datasets/tao/tao_dataset_v05.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ detector.pop('data_preprocessor')
9
+ detector['init_cfg'] = dict(
10
+ type='Pretrained',
11
+ checkpoint= 'saved_models/tsa_models/detic_centernet2_swin-b_fpn_4x_lvis-base_in21k-lvis-ec91245d.pth'
12
+ # noqa: E501
13
+ )
14
+ detector['type'] = 'DeticMasa'
15
+
16
+ del _base_.model
17
+
18
+ model = dict(
19
+ type='MASA',
20
+ freeze_detector=True,
21
+ unified_backbone=True,
22
+ load_public_dets = True,
23
+ benchmark='tao',
24
+ public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/teter_swinT_tao_val_internms_50/',
25
+ data_preprocessor=dict(
26
+ type='TrackDataPreprocessor',
27
+ # Image normalization parameters
28
+ mean=[123.675, 116.28, 103.53],
29
+ std=[58.395, 57.12, 57.375],
30
+ bgr_to_rgb=True,
31
+ # Image padding parameters
32
+ pad_mask=True, # In instance segmentation, the mask needs to be padded
33
+ pad_size_divisor=32), # Padding the image to multiples of 32
34
+ detector=detector,
35
+ masa_adapter=[
36
+ dict(
37
+ type='FPN',
38
+ in_channels=[256, 512, 1024],
39
+ out_channels=256,
40
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
41
+ num_outs=5),
42
+ dict(
43
+ type='DeformFusion',
44
+ in_channels=256,
45
+ out_channels=256,
46
+ num_blocks=3)],
47
+ rpn_head=dict(
48
+ type='RPNHead',
49
+ in_channels=256,
50
+ feat_channels=256,
51
+ anchor_generator=dict(
52
+ type='AnchorGenerator',
53
+ scales=[8],
54
+ ratios=[0.5, 1.0, 2.0],
55
+ strides=[8, 16, 32, 64, 128]),
56
+ bbox_coder=dict(
57
+ type='DeltaXYWHBBoxCoder',
58
+ target_means=[.0, .0, .0, .0],
59
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
60
+ loss_cls=dict(
61
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
62
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
63
+ ),
64
+ roi_head=dict(
65
+ type='StandardRoIHead',
66
+ bbox_roi_extractor=dict(
67
+ type='SingleRoIExtractor',
68
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
69
+ out_channels=256,
70
+ featmap_strides=[8, 16, 32]),
71
+ bbox_head=dict(
72
+ type='Shared2FCBBoxHead',
73
+ in_channels=256,
74
+ fc_out_channels=1024,
75
+ roi_feat_size=7,
76
+ num_classes=1,
77
+ bbox_coder=dict(
78
+ type='DeltaXYWHBBoxCoder',
79
+ target_means=[0., 0., 0., 0.],
80
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
81
+ reg_class_agnostic=True,
82
+ loss_cls=dict(
83
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
84
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
85
+ # model training and testing settings
86
+ train_cfg=dict(
87
+ rpn=dict(
88
+ assigner=dict(
89
+ type='MaxIoUAssigner',
90
+ pos_iou_thr=0.7,
91
+ neg_iou_thr=0.3,
92
+ min_pos_iou=0.3,
93
+ match_low_quality=True,
94
+ ignore_iof_thr=-1),
95
+ sampler=dict(
96
+ type='RandomSampler',
97
+ num=256,
98
+ pos_fraction=0.5,
99
+ neg_pos_ub=-1,
100
+ add_gt_as_proposals=False),
101
+ allowed_border=-1,
102
+ pos_weight=-1,
103
+ debug=False),
104
+ rpn_proposal=dict(
105
+ nms_pre=2000,
106
+ max_per_img=1000,
107
+ nms=dict(type='nms', iou_threshold=0.7),
108
+ min_bbox_size=0),
109
+ rcnn=dict(
110
+ assigner=dict(
111
+ type='MaxIoUAssigner',
112
+ pos_iou_thr=0.5,
113
+ neg_iou_thr=0.5,
114
+ min_pos_iou=0.5,
115
+ match_low_quality=False,
116
+ ignore_iof_thr=-1),
117
+ sampler=dict(
118
+ type='RandomSampler',
119
+ num=512,
120
+ pos_fraction=0.25,
121
+ neg_pos_ub=-1,
122
+ add_gt_as_proposals=True),
123
+ pos_weight=-1,
124
+ debug=False)),
125
+ test_cfg=dict(
126
+ rpn=dict(
127
+ nms_pre=1000,
128
+ max_per_img=1000,
129
+ nms=dict(type='nms', iou_threshold=0.7),
130
+ min_bbox_size=0),
131
+ rcnn=dict(
132
+ score_thr=0.05,
133
+ nms=dict(type='nms', iou_threshold=0.5),
134
+ max_per_img=100)
135
+ # soft-nms is also supported for rcnn testing
136
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
137
+ ),
138
+ track_head=dict(
139
+ type='MasaTrackHead',
140
+ roi_extractor=dict(
141
+ type='SingleRoIExtractor',
142
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
143
+ out_channels=256,
144
+ featmap_strides=[8, 16, 32]),
145
+ embed_head=dict(
146
+ type='QuasiDenseEmbedHead',
147
+ num_convs=4,
148
+ num_fcs=1,
149
+ embed_channels=256,
150
+ norm_cfg=dict(type='GN', num_groups=32),
151
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
152
+ loss_track_aux=dict(
153
+ type='MarginL2Loss',
154
+ neg_pos_ub=3,
155
+ pos_margin=0,
156
+ neg_margin=0.1,
157
+ hard_mining=True,
158
+ loss_weight=1.0)),
159
+ # loss_bbox=dict(type='L1Loss', loss_weight=1.0),
160
+ train_cfg=dict(
161
+ assigner=dict(
162
+ type='MaxIoUAssigner',
163
+ pos_iou_thr=0.7,
164
+ neg_iou_thr=0.5,
165
+ min_pos_iou=0.5,
166
+ match_low_quality=False,
167
+ ignore_iof_thr=-1),
168
+ sampler=dict(
169
+ type='CombinedSampler',
170
+ num=512,
171
+ pos_fraction=0.8,
172
+ neg_pos_ub=3,
173
+ add_gt_as_proposals=True,
174
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
175
+ neg_sampler=dict(type='RandomSampler')))),
176
+ tracker=dict(
177
+ type='MasaTaoTracker',
178
+ init_score_thr=0.0001,
179
+ obj_score_thr=0.0001,
180
+ match_score_thr=0.5,
181
+ memo_tracklet_frames=10,
182
+ memo_momentum=0.8,
183
+ with_cats=False,
184
+ max_distance=-1,
185
+ fps=1,
186
+ )
187
+ )
188
+
189
+ train_dataloader = None
190
+ train_cfg = None
191
+ val_cfg = dict(type='ValLoop')
192
+ test_cfg = dict(type='TestLoop')
193
+
194
+ default_hooks = dict(
195
+ logger=dict(type='LoggerHook', interval=50),
196
+ visualization=dict(type='TrackVisualizationHook', draw=False))
197
+
198
+ vis_backends = [dict(type='LocalVisBackend')]
199
+ visualizer = dict(
200
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
201
+
202
+ # custom hooks
203
+ custom_hooks = [
204
+ # Synchronize model buffers such as running_mean and running_var in BN
205
+ # at the end of each epoch
206
+ dict(type='SyncBuffersHook')
207
+ ]
208
+ auto_scale_lr = dict(enable=False, base_batch_size=16)
209
+ val_dataloader = dict(
210
+ dataset=dict(
211
+ ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json'
212
+ )
213
+ )
214
+ test_dataloader = val_dataloader
215
+ val_evaluator = dict(
216
+ ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
217
+ outfile_prefix='results/masa_results/masa-detic-release-test',
218
+ )
219
+ test_evaluator = val_evaluator
configs/masa-gdino/bdd_test/masa_gdino_bdd_mot_test.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../../../projects/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata_masa.py',
3
+ '../../datasets/bdd/bdd_dataset.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ # detector.backbone.update(dict(out_indices=(1, 2, 3)))
9
+ detector.pop('data_preprocessor')
10
+ detector['init_cfg'] = dict(
11
+ type='Pretrained',
12
+ checkpoint= 'saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'
13
+ # noqa: E501
14
+ )
15
+ detector['type'] = 'GroundingDINOMasa'
16
+
17
+ del _base_.model
18
+
19
+ model = dict(
20
+ type='MASA',
21
+ freeze_detector=True,
22
+ unified_backbone=True,
23
+ load_public_dets = True,
24
+ benchmark = 'bdd',
25
+ public_det_path = 'results/public_dets/bdd_mot_yolox_dets/',
26
+ data_preprocessor=dict(
27
+ type='TrackDataPreprocessor',
28
+ # Image normalization parameters
29
+ mean=[123.675, 116.28, 103.53],
30
+ std=[58.395, 57.12, 57.375],
31
+ bgr_to_rgb=True,
32
+ # Image padding parameters
33
+ pad_mask=False, # In instance segmentation, the mask needs to be padded
34
+         pad_size_divisor=1024, # Padding the image to multiples of 1024
35
+ ),
36
+ detector=detector,
37
+ masa_adapter=[
38
+ dict(
39
+ type='FPN',
40
+ in_channels=[256, 512, 1024],
41
+ out_channels=256,
42
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
43
+ num_outs=5),
44
+ dict(
45
+ type='DeformFusion',
46
+ in_channels=256,
47
+ out_channels=256,
48
+ num_blocks=3)],
49
+ rpn_head=dict(
50
+ type='RPNHead',
51
+ in_channels=256,
52
+ feat_channels=256,
53
+ anchor_generator=dict(
54
+ type='AnchorGenerator',
55
+ scales=[8],
56
+ ratios=[0.5, 1.0, 2.0],
57
+ strides=[8, 16, 32, 64, 128]),
58
+ bbox_coder=dict(
59
+ type='DeltaXYWHBBoxCoder',
60
+ target_means=[.0, .0, .0, .0],
61
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
62
+ loss_cls=dict(
63
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
64
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
65
+ ),
66
+ roi_head=dict(
67
+ type='StandardRoIHead',
68
+ bbox_roi_extractor=dict(
69
+ type='SingleRoIExtractor',
70
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
71
+ out_channels=256,
72
+ featmap_strides=[8, 16, 32]),
73
+ bbox_head=dict(
74
+ type='Shared2FCBBoxHead',
75
+ in_channels=256,
76
+ fc_out_channels=1024,
77
+ roi_feat_size=7,
78
+ num_classes=1,
79
+ bbox_coder=dict(
80
+ type='DeltaXYWHBBoxCoder',
81
+ target_means=[0., 0., 0., 0.],
82
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
83
+ reg_class_agnostic=True,
84
+ loss_cls=dict(
85
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
86
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
87
+ # model training and testing settings
88
+ train_cfg=dict(
89
+ rpn=dict(
90
+ assigner=dict(
91
+ type='MaxIoUAssigner',
92
+ pos_iou_thr=0.7,
93
+ neg_iou_thr=0.3,
94
+ min_pos_iou=0.3,
95
+ match_low_quality=True,
96
+ ignore_iof_thr=-1),
97
+ sampler=dict(
98
+ type='RandomSampler',
99
+ num=256,
100
+ pos_fraction=0.5,
101
+ neg_pos_ub=-1,
102
+ add_gt_as_proposals=False),
103
+ allowed_border=-1,
104
+ pos_weight=-1,
105
+ debug=False),
106
+ rpn_proposal=dict(
107
+ nms_pre=2000,
108
+ max_per_img=1000,
109
+ nms=dict(type='nms', iou_threshold=0.7),
110
+ min_bbox_size=0),
111
+ rcnn=dict(
112
+ assigner=dict(
113
+ type='MaxIoUAssigner',
114
+ pos_iou_thr=0.5,
115
+ neg_iou_thr=0.5,
116
+ min_pos_iou=0.5,
117
+ match_low_quality=False,
118
+ ignore_iof_thr=-1),
119
+ sampler=dict(
120
+ type='RandomSampler',
121
+ num=512,
122
+ pos_fraction=0.25,
123
+ neg_pos_ub=-1,
124
+ add_gt_as_proposals=True),
125
+ pos_weight=-1,
126
+ debug=False)),
127
+ test_cfg=dict(
128
+ rpn=dict(
129
+ nms_pre=1000,
130
+ max_per_img=1000,
131
+ nms=dict(type='nms', iou_threshold=0.7),
132
+ min_bbox_size=0),
133
+ rcnn=dict(
134
+ score_thr=0.02,
135
+ # nms=dict(type='nms', iou_threshold=0.5),
136
+ nms=dict(type='nms',
137
+ iou_threshold=0.5,
138
+ class_agnostic=True,
139
+ split_thr=100000),
140
+ max_per_img=50,
141
+ mask_thr_binary=0.5)
142
+ # soft-nms is also supported for rcnn testing
143
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
144
+ ),
145
+ track_head=dict(
146
+ type='MasaTrackHead',
147
+ roi_extractor=dict(
148
+ type='SingleRoIExtractor',
149
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
150
+ out_channels=256,
151
+ featmap_strides=[8, 16, 32]),
152
+ embed_head=dict(
153
+ type='QuasiDenseEmbedHead',
154
+ num_convs=4,
155
+ num_fcs=1,
156
+ embed_channels=256,
157
+ norm_cfg=dict(type='GN', num_groups=32),
158
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
159
+ loss_track_aux=dict(
160
+ type='MarginL2Loss',
161
+ neg_pos_ub=3,
162
+ pos_margin=0,
163
+ neg_margin=0.1,
164
+ hard_mining=True,
165
+ loss_weight=1.0)),
166
+ # loss_bbox=dict(type='L1Loss', loss_weight=1.0),
167
+ train_cfg=dict(
168
+ assigner=dict(
169
+ type='MaxIoUAssigner',
170
+ pos_iou_thr=0.7,
171
+ neg_iou_thr=0.3,
172
+ min_pos_iou=0.5,
173
+ match_low_quality=False,
174
+ ignore_iof_thr=-1),
175
+ sampler=dict(
176
+ type='CombinedSampler',
177
+ num=512,
178
+ pos_fraction=0.5,
179
+ neg_pos_ub=3,
180
+ add_gt_as_proposals=True,
181
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
182
+ neg_sampler=dict(type='RandomSampler')))),
183
+ tracker=dict(
184
+ type='MasaBDDTracker',
185
+ init_score_thr=0.5,
186
+ obj_score_thr=0.3,
187
+ match_score_thr=0.6,
188
+ memo_tracklet_frames=10,
189
+ memo_backdrop_frames=1,
190
+ memo_momentum=0.8,
191
+ nms_conf_thr=0.5,
192
+ nms_backdrop_iou_thr=0.3,
193
+ nms_class_iou_thr=0.7,
194
+ with_cats=False,
195
+ match_metric='bisoftmax')
196
+ )
197
+
198
+ # runtime settings
199
+ train_dataloader = None
200
+ train_cfg = None
201
+ val_cfg = dict(type='ValLoop')
202
+ test_cfg = dict(type='TestLoop')
203
+
204
+ default_hooks = dict(
205
+ logger=dict(type='LoggerHook', interval=50),
206
+ visualization=dict(type='TrackVisualizationHook', draw=False),
207
+ checkpoint = dict(type='CheckpointHook', interval=1),
208
+ )
209
+
210
+ vis_backends = [dict(type='LocalVisBackend')]
211
+ visualizer = dict(
212
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
213
+
214
+ val_dataloader = dict(
215
+ dataset=dict(
216
+ ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
217
+ )
218
+ )
219
+ test_dataloader = val_dataloader
220
+ val_evaluator = dict(
221
+ ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
222
+ scalabel_gt='data/bdd/annotations/scalabel_gt/box_track_20/val/',
223
+ outfile_prefix='results/detic_masa_trained_bdd_demo',
224
+ metric=['TETA', 'HOTA', 'CLEAR']
225
+ )
226
+ test_evaluator = val_evaluator
configs/masa-gdino/bdd_test/masa_gdino_bdd_mots_test.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../../../projects/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata_masa.py',
3
+ '../../datasets/bdd/bdd_dataset.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ detector.pop('data_preprocessor')
9
+ detector['init_cfg'] = dict(
10
+ type='Pretrained',
11
+ checkpoint= 'saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'
12
+ # noqa: E501
13
+ )
14
+ detector['type'] = 'GroundingDINOMasa'
15
+
16
+ del _base_.model
17
+
18
+ model = dict(
19
+ type='MASA',
20
+ freeze_detector=True,
21
+ unified_backbone=True,
22
+ load_public_dets = True,
23
+ with_segm=True,
24
+ benchmark = 'bdd',
25
+ public_det_path = 'results/public_dets/bdd_mots_val_uninext_dets/',
26
+ data_preprocessor=dict(
27
+ type='TrackDataPreprocessor',
28
+ # Image normalization parameters
29
+ mean=[123.675, 116.28, 103.53],
30
+ std=[58.395, 57.12, 57.375],
31
+ bgr_to_rgb=True,
32
+ # Image padding parameters
33
+ pad_mask=False, # In instance segmentation, the mask needs to be padded
34
+         pad_size_divisor=1024, # Padding the image to multiples of 1024
35
+ ),
36
+ detector=detector,
37
+ masa_adapter=[
38
+ dict(
39
+ type='FPN',
40
+ in_channels=[256, 512, 1024],
41
+ out_channels=256,
42
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
43
+ num_outs=5),
44
+ dict(
45
+ type='DeformFusion',
46
+ in_channels=256,
47
+ out_channels=256,
48
+ num_blocks=3)],
49
+ rpn_head=dict(
50
+ type='RPNHead',
51
+ in_channels=256,
52
+ feat_channels=256,
53
+ anchor_generator=dict(
54
+ type='AnchorGenerator',
55
+ scales=[8],
56
+ ratios=[0.5, 1.0, 2.0],
57
+ strides=[8, 16, 32, 64, 128]),
58
+ bbox_coder=dict(
59
+ type='DeltaXYWHBBoxCoder',
60
+ target_means=[.0, .0, .0, .0],
61
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
62
+ loss_cls=dict(
63
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
64
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
65
+ ),
66
+ roi_head=dict(
67
+ type='StandardRoIHead',
68
+ bbox_roi_extractor=dict(
69
+ type='SingleRoIExtractor',
70
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
71
+ out_channels=256,
72
+ featmap_strides=[8, 16, 32]),
73
+ bbox_head=dict(
74
+ type='Shared2FCBBoxHead',
75
+ in_channels=256,
76
+ fc_out_channels=1024,
77
+ roi_feat_size=7,
78
+ num_classes=1,
79
+ bbox_coder=dict(
80
+ type='DeltaXYWHBBoxCoder',
81
+ target_means=[0., 0., 0., 0.],
82
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
83
+ reg_class_agnostic=True,
84
+ loss_cls=dict(
85
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
86
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
87
+ # model training and testing settings
88
+ train_cfg=dict(
89
+ rpn=dict(
90
+ assigner=dict(
91
+ type='MaxIoUAssigner',
92
+ pos_iou_thr=0.7,
93
+ neg_iou_thr=0.3,
94
+ min_pos_iou=0.3,
95
+ match_low_quality=True,
96
+ ignore_iof_thr=-1),
97
+ sampler=dict(
98
+ type='RandomSampler',
99
+ num=256,
100
+ pos_fraction=0.5,
101
+ neg_pos_ub=-1,
102
+ add_gt_as_proposals=False),
103
+ allowed_border=-1,
104
+ pos_weight=-1,
105
+ debug=False),
106
+ rpn_proposal=dict(
107
+ nms_pre=2000,
108
+ max_per_img=1000,
109
+ nms=dict(type='nms', iou_threshold=0.7),
110
+ min_bbox_size=0),
111
+ rcnn=dict(
112
+ assigner=dict(
113
+ type='MaxIoUAssigner',
114
+ pos_iou_thr=0.5,
115
+ neg_iou_thr=0.5,
116
+ min_pos_iou=0.5,
117
+ match_low_quality=False,
118
+ ignore_iof_thr=-1),
119
+ sampler=dict(
120
+ type='RandomSampler',
121
+ num=512,
122
+ pos_fraction=0.25,
123
+ neg_pos_ub=-1,
124
+ add_gt_as_proposals=True),
125
+ pos_weight=-1,
126
+ debug=False)),
127
+ test_cfg=dict(
128
+ rpn=dict(
129
+ nms_pre=1000,
130
+ max_per_img=1000,
131
+ nms=dict(type='nms', iou_threshold=0.7),
132
+ min_bbox_size=0),
133
+ rcnn=dict(
134
+ score_thr=0.02,
135
+ # nms=dict(type='nms', iou_threshold=0.5),
136
+ nms=dict(type='nms',
137
+ iou_threshold=0.5,
138
+ class_agnostic=True,
139
+ split_thr=100000),
140
+ max_per_img=50,
141
+ mask_thr_binary=0.5)
142
+ # soft-nms is also supported for rcnn testing
143
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
144
+ ),
145
+ track_head=dict(
146
+ type='MasaTrackHead',
147
+ roi_extractor=dict(
148
+ type='SingleRoIExtractor',
149
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
150
+ out_channels=256,
151
+ featmap_strides=[8, 16, 32]),
152
+ embed_head=dict(
153
+ type='QuasiDenseEmbedHead',
154
+ num_convs=4,
155
+ num_fcs=1,
156
+ embed_channels=256,
157
+ norm_cfg=dict(type='GN', num_groups=32),
158
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
159
+ loss_track_aux=dict(
160
+ type='MarginL2Loss',
161
+ neg_pos_ub=3,
162
+ pos_margin=0,
163
+ neg_margin=0.1,
164
+ hard_mining=True,
165
+ loss_weight=1.0)),
166
+ # loss_bbox=dict(type='L1Loss', loss_weight=1.0),
167
+ train_cfg=dict(
168
+ assigner=dict(
169
+ type='MaxIoUAssigner',
170
+ pos_iou_thr=0.7,
171
+ neg_iou_thr=0.3,
172
+ min_pos_iou=0.5,
173
+ match_low_quality=False,
174
+ ignore_iof_thr=-1),
175
+ sampler=dict(
176
+ type='CombinedSampler',
177
+ num=512,
178
+ pos_fraction=0.5,
179
+ neg_pos_ub=3,
180
+ add_gt_as_proposals=True,
181
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
182
+ neg_sampler=dict(type='RandomSampler')))),
183
+ tracker=dict(
184
+ type='MasaBDDTracker',
185
+ init_score_thr=0.5,
186
+ obj_score_thr=0.3,
187
+ match_score_thr=0.6,
188
+ memo_tracklet_frames=10,
189
+ memo_backdrop_frames=1,
190
+ memo_momentum=0.8,
191
+ nms_conf_thr=0.5,
192
+ nms_backdrop_iou_thr=0.3,
193
+ nms_class_iou_thr=0.7,
194
+ with_cats=False,
195
+ match_metric='bisoftmax')
196
+ )
197
+
198
+ # runtime settings
199
+ train_dataloader = None
200
+ train_cfg = None
201
+ val_cfg = dict(type='ValLoop')
202
+ test_cfg = dict(type='TestLoop')
203
+
204
+ default_hooks = dict(
205
+ logger=dict(type='LoggerHook', interval=50),
206
+ visualization=dict(type='TrackVisualizationHook', draw=False),
207
+ checkpoint = dict(type='CheckpointHook', interval=1),
208
+ )
209
+
210
+ vis_backends = [dict(type='LocalVisBackend')]
211
+ visualizer = dict(
212
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
213
+
214
+ val_dataloader = dict(
215
+ dataset=dict(
216
+ ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
217
+ )
218
+ )
219
+ test_dataloader = val_dataloader
220
+ val_evaluator = dict(
221
+ ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
222
+ scalabel_gt='data/bdd/annotations/scalabel_gt/seg_track_20/val/',
223
+ outfile_prefix='results/masa_results/masa-groundingdino-release-bdd-mots-test',
224
+ metric=['TETA', 'HOTA', 'CLEAR'],
225
+ with_mask=True,
226
+ )
227
+ test_evaluator = val_evaluator
configs/masa-gdino/masa_gdino_swinb_inference.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../../projects/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata_masa.py',
3
+ '../default_runtime.py'
4
+ ]
5
+ default_scope = 'mmdet'
6
+ detector = _base_.model
7
+ detector.pop('data_preprocessor')
8
+ detector['init_cfg'] = dict(
9
+ type='Pretrained',
10
+ checkpoint= 'saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'
11
+ # noqa: E501
12
+ )
13
+ detector['type'] = 'GroundingDINOMasa'
14
+
15
+ del _base_.model
16
+
17
+ model = dict(
18
+ type='MASA',
19
+ freeze_detector=True,
20
+ unified_backbone=True,
21
+ load_public_dets = False,
22
+ data_preprocessor=dict(
23
+ type='TrackDataPreprocessor',
24
+ # Image normalization parameters
25
+ mean=[123.675, 116.28, 103.53],
26
+ std=[58.395, 57.12, 57.375],
27
+ bgr_to_rgb=True,
28
+ # Image padding parameters
29
+ pad_mask=False, # In instance segmentation, the mask needs to be padded
30
+ pad_size_divisor=32), # Padding the image to multiples of 32
31
+ detector=detector,
32
+ masa_adapter=[
33
+ dict(
34
+ type='FPN',
35
+ in_channels=[256, 512, 1024],
36
+ out_channels=256,
37
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
38
+ num_outs=5),
39
+ dict(
40
+ type='DeformFusion',
41
+ in_channels=256,
42
+ out_channels=256,
43
+ num_blocks=3)],
44
+ rpn_head=dict(
45
+ type='RPNHead',
46
+ in_channels=256,
47
+ feat_channels=256,
48
+ anchor_generator=dict(
49
+ type='AnchorGenerator',
50
+ scales=[8],
51
+ ratios=[0.5, 1.0, 2.0],
52
+ strides=[8, 16, 32, 64, 128]),
53
+ bbox_coder=dict(
54
+ type='DeltaXYWHBBoxCoder',
55
+ target_means=[.0, .0, .0, .0],
56
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
57
+ loss_cls=dict(
58
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
59
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
60
+ ),
61
+ roi_head=dict(
62
+ type='StandardRoIHead',
63
+ bbox_roi_extractor=dict(
64
+ type='SingleRoIExtractor',
65
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
66
+ out_channels=256,
67
+ featmap_strides=[8, 16, 32]),
68
+ bbox_head=dict(
69
+ type='Shared2FCBBoxHead',
70
+ in_channels=256,
71
+ fc_out_channels=1024,
72
+ roi_feat_size=7,
73
+ num_classes=1,
74
+ bbox_coder=dict(
75
+ type='DeltaXYWHBBoxCoder',
76
+ target_means=[0., 0., 0., 0.],
77
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
78
+ reg_class_agnostic=True,
79
+ loss_cls=dict(
80
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
81
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
82
+ # model training and testing settings
83
+ train_cfg=dict(
84
+ rpn=dict(
85
+ assigner=dict(
86
+ type='MaxIoUAssigner',
87
+ pos_iou_thr=0.7,
88
+ neg_iou_thr=0.3,
89
+ min_pos_iou=0.3,
90
+ match_low_quality=True,
91
+ ignore_iof_thr=-1),
92
+ sampler=dict(
93
+ type='RandomSampler',
94
+ num=256,
95
+ pos_fraction=0.5,
96
+ neg_pos_ub=-1,
97
+ add_gt_as_proposals=False),
98
+ allowed_border=-1,
99
+ pos_weight=-1,
100
+ debug=False),
101
+ rpn_proposal=dict(
102
+ nms_pre=2000,
103
+ max_per_img=1000,
104
+ nms=dict(type='nms', iou_threshold=0.7),
105
+ min_bbox_size=0),
106
+ rcnn=dict(
107
+ assigner=dict(
108
+ type='MaxIoUAssigner',
109
+ pos_iou_thr=0.5,
110
+ neg_iou_thr=0.5,
111
+ min_pos_iou=0.5,
112
+ match_low_quality=False,
113
+ ignore_iof_thr=-1),
114
+ sampler=dict(
115
+ type='RandomSampler',
116
+ num=512,
117
+ pos_fraction=0.25,
118
+ neg_pos_ub=-1,
119
+ add_gt_as_proposals=True),
120
+ pos_weight=-1,
121
+ debug=False)),
122
+ test_cfg=dict(
123
+ rpn=dict(
124
+ nms_pre=1000,
125
+ max_per_img=1000,
126
+ nms=dict(type='nms', iou_threshold=0.7),
127
+ min_bbox_size=0),
128
+ rcnn=dict(
129
+ score_thr=0.02,
130
+ # nms=dict(type='nms', iou_threshold=0.5),
131
+ nms=dict(type='nms',
132
+ iou_threshold=0.5,
133
+ class_agnostic=True,
134
+ split_thr=100000),
135
+ max_per_img=50,
136
+ mask_thr_binary=0.5)
137
+ # soft-nms is also supported for rcnn testing
138
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
139
+ ),
140
+ track_head=dict(
141
+ type='MasaTrackHead',
142
+ roi_extractor=dict(
143
+ type='SingleRoIExtractor',
144
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
145
+ out_channels=256,
146
+ featmap_strides=[8, 16, 32]),
147
+ embed_head=dict(
148
+ type='QuasiDenseEmbedHead',
149
+ num_convs=4,
150
+ num_fcs=1,
151
+ embed_channels=256,
152
+ norm_cfg=dict(type='GN', num_groups=32),
153
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
154
+ loss_track_aux=dict(
155
+ type='MarginL2Loss',
156
+ neg_pos_ub=3,
157
+ pos_margin=0,
158
+ neg_margin=0.1,
159
+ hard_mining=True,
160
+ loss_weight=1.0)),
161
+ train_cfg=dict(
162
+ assigner=dict(
163
+ type='MaxIoUAssigner',
164
+ pos_iou_thr=0.7,
165
+ neg_iou_thr=0.3,
166
+ min_pos_iou=0.5,
167
+ match_low_quality=False,
168
+ ignore_iof_thr=-1),
169
+ sampler=dict(
170
+ type='CombinedSampler',
171
+ num=512,
172
+ pos_fraction=0.5,
173
+ neg_pos_ub=3,
174
+ add_gt_as_proposals=True,
175
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
176
+ neg_sampler=dict(type='RandomSampler')))),
177
+ tracker=dict(
178
+ type='MasaTaoTracker',
179
+ init_score_thr=0.1,
180
+ obj_score_thr=0.01,
181
+ match_score_thr=0.5,
182
+ memo_tracklet_frames=10,
183
+ memo_momentum=0.8,
184
+ with_cats=False,
185
+ max_distance=100,
186
+ fps=30,
187
+ )
188
+ )
189
+
190
+ inference_pipeline = [
191
+ dict(
192
+ type='TransformBroadcaster',
193
+ transforms=[
194
+ dict(
195
+ type='Resize',
196
+ scale=(1333, 800),
197
+ keep_ratio=True),
198
+ ]),
199
+ dict(type='PackTrackInputs')
200
+ ]
201
+
202
+ # runtime settings
203
+ train_cfg = None
204
+ val_cfg = dict(type='ValLoop')
205
+ test_cfg = dict(type='TestLoop')
206
+
207
+ default_hooks = dict(
208
+ logger=dict(type='LoggerHook', interval=50),
209
+ visualization=dict(type='TrackVisualizationHook', draw=False),
210
+ checkpoint = dict(type='CheckpointHook', interval=1),
211
+ )
212
+
213
+ vis_backends = [dict(type='LocalVisBackend')]
214
+ visualizer = dict(
215
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
216
+
configs/masa-gdino/masa_gdino_swinb_plug_and_play.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../../projects/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata_masa.py',
3
+ '../default_runtime.py'
4
+ ]
5
+ default_scope = 'mmdet'
6
+ detector = _base_.model
7
+ detector.pop('data_preprocessor')
8
+ detector['init_cfg'] = dict(
9
+ type='Pretrained',
10
+ checkpoint= 'saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'
11
+ # noqa: E501
12
+ )
13
+ detector['type'] = 'GroundingDINOMasa'
14
+
15
+ del _base_.model
16
+
17
+ model = dict(
18
+ type='MASA',
19
+ freeze_detector=True,
20
+ unified_backbone=True,
21
+ load_public_dets = False,
22
+ given_dets = True,
23
+ data_preprocessor=dict(
24
+ type='TrackDataPreprocessor',
25
+ # Image normalization parameters
26
+ mean=[123.675, 116.28, 103.53],
27
+ std=[58.395, 57.12, 57.375],
28
+ bgr_to_rgb=True,
29
+ # Image padding parameters
30
+ pad_mask=False, # In instance segmentation, the mask needs to be padded
31
+ pad_size_divisor=32), # Padding the image to multiples of 32
32
+ detector=detector,
33
+ masa_adapter=[
34
+ dict(
35
+ type='FPN',
36
+ in_channels=[256, 512, 1024],
37
+ out_channels=256,
38
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
39
+ num_outs=5),
40
+ dict(
41
+ type='DeformFusion',
42
+ in_channels=256,
43
+ out_channels=256,
44
+ num_blocks=3)],
45
+ rpn_head=dict(
46
+ type='RPNHead',
47
+ in_channels=256,
48
+ feat_channels=256,
49
+ anchor_generator=dict(
50
+ type='AnchorGenerator',
51
+ scales=[8],
52
+ ratios=[0.5, 1.0, 2.0],
53
+ strides=[8, 16, 32, 64, 128]),
54
+ bbox_coder=dict(
55
+ type='DeltaXYWHBBoxCoder',
56
+ target_means=[.0, .0, .0, .0],
57
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
58
+ loss_cls=dict(
59
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
60
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
61
+ ),
62
+ roi_head=dict(
63
+ type='StandardRoIHead',
64
+ bbox_roi_extractor=dict(
65
+ type='SingleRoIExtractor',
66
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
67
+ out_channels=256,
68
+ featmap_strides=[8, 16, 32]),
69
+ bbox_head=dict(
70
+ type='Shared2FCBBoxHead',
71
+ in_channels=256,
72
+ fc_out_channels=1024,
73
+ roi_feat_size=7,
74
+ num_classes=1,
75
+ bbox_coder=dict(
76
+ type='DeltaXYWHBBoxCoder',
77
+ target_means=[0., 0., 0., 0.],
78
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
79
+ reg_class_agnostic=True,
80
+ loss_cls=dict(
81
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
82
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
83
+ # model training and testing settings
84
+ train_cfg=dict(
85
+ rpn=dict(
86
+ assigner=dict(
87
+ type='MaxIoUAssigner',
88
+ pos_iou_thr=0.7,
89
+ neg_iou_thr=0.3,
90
+ min_pos_iou=0.3,
91
+ match_low_quality=True,
92
+ ignore_iof_thr=-1),
93
+ sampler=dict(
94
+ type='RandomSampler',
95
+ num=256,
96
+ pos_fraction=0.5,
97
+ neg_pos_ub=-1,
98
+ add_gt_as_proposals=False),
99
+ allowed_border=-1,
100
+ pos_weight=-1,
101
+ debug=False),
102
+ rpn_proposal=dict(
103
+ nms_pre=2000,
104
+ max_per_img=1000,
105
+ nms=dict(type='nms', iou_threshold=0.7),
106
+ min_bbox_size=0),
107
+ rcnn=dict(
108
+ assigner=dict(
109
+ type='MaxIoUAssigner',
110
+ pos_iou_thr=0.5,
111
+ neg_iou_thr=0.5,
112
+ min_pos_iou=0.5,
113
+ match_low_quality=False,
114
+ ignore_iof_thr=-1),
115
+ sampler=dict(
116
+ type='RandomSampler',
117
+ num=512,
118
+ pos_fraction=0.25,
119
+ neg_pos_ub=-1,
120
+ add_gt_as_proposals=True),
121
+ pos_weight=-1,
122
+ debug=False)),
123
+ test_cfg=dict(
124
+ rpn=dict(
125
+ nms_pre=1000,
126
+ max_per_img=1000,
127
+ nms=dict(type='nms', iou_threshold=0.7),
128
+ min_bbox_size=0),
129
+ rcnn=dict(
130
+ score_thr=0.02,
131
+ # nms=dict(type='nms', iou_threshold=0.5),
132
+ nms=dict(type='nms',
133
+ iou_threshold=0.5,
134
+ class_agnostic=True,
135
+ split_thr=100000),
136
+ max_per_img=50,
137
+ mask_thr_binary=0.5)
138
+ # soft-nms is also supported for rcnn testing
139
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
140
+ ),
141
+ track_head=dict(
142
+ type='MasaTrackHead',
143
+ roi_extractor=dict(
144
+ type='SingleRoIExtractor',
145
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
146
+ out_channels=256,
147
+ featmap_strides=[8, 16, 32]),
148
+ embed_head=dict(
149
+ type='QuasiDenseEmbedHead',
150
+ num_convs=4,
151
+ num_fcs=1,
152
+ embed_channels=256,
153
+ norm_cfg=dict(type='GN', num_groups=32),
154
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
155
+ loss_track_aux=dict(
156
+ type='MarginL2Loss',
157
+ neg_pos_ub=3,
158
+ pos_margin=0,
159
+ neg_margin=0.1,
160
+ hard_mining=True,
161
+ loss_weight=1.0)),
162
+ train_cfg=dict(
163
+ assigner=dict(
164
+ type='MaxIoUAssigner',
165
+ pos_iou_thr=0.7,
166
+ neg_iou_thr=0.3,
167
+ min_pos_iou=0.5,
168
+ match_low_quality=False,
169
+ ignore_iof_thr=-1),
170
+ sampler=dict(
171
+ type='CombinedSampler',
172
+ num=512,
173
+ pos_fraction=0.5,
174
+ neg_pos_ub=3,
175
+ add_gt_as_proposals=True,
176
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
177
+ neg_sampler=dict(type='RandomSampler')))),
178
+ tracker=dict(
179
+ type='MasaTaoTracker',
180
+ init_score_thr=0.1,
181
+ obj_score_thr=0.01,
182
+ match_score_thr=0.5,
183
+ memo_tracklet_frames=10,
184
+ memo_momentum=0.8,
185
+ with_cats=False,
186
+ max_distance=100,
187
+ fps=30,
188
+ )
189
+ )
190
+
191
+ inference_pipeline = [
192
+ dict(
193
+ type='TransformBroadcaster',
194
+ transforms=[
195
+ dict(
196
+ type='Resize',
197
+ scale=(1333, 800),
198
+ keep_ratio=True),
199
+ ]),
200
+ dict(type='PackTrackInputs')
201
+ ]
202
+
203
+
204
+ # runtime settings
205
+ train_cfg = None
206
+ val_cfg = dict(type='ValLoop')
207
+ test_cfg = dict(type='TestLoop')
208
+
209
+ default_hooks = dict(
210
+ logger=dict(type='LoggerHook', interval=50),
211
+ visualization=dict(type='TrackVisualizationHook', draw=False),
212
+ checkpoint = dict(type='CheckpointHook', interval=1),
213
+ )
214
+
215
+ vis_backends = [dict(type='LocalVisBackend')]
216
+ visualizer = dict(
217
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
218
+
configs/masa-gdino/open_vocabulary_mot_test/masa_gdino_swinb_open_vocabulary_test.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../../../projects/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata_masa.py',
3
+ '../../datasets/tao/tao_dataset_v1.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ # detector.backbone.update(dict(out_indices=(1, 2, 3)))
9
+ detector.pop('data_preprocessor')
10
+ detector['init_cfg'] = dict(
11
+ type='Pretrained',
12
+ checkpoint= 'saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'
13
+ # noqa: E501
14
+ )
15
+ detector['type'] = 'GroundingDINOMasa'
16
+
17
+ del _base_.model
18
+
19
+ model = dict(
20
+ type='MASA',
21
+ freeze_detector=True,
22
+ unified_backbone=True,
23
+ load_public_dets = True,
24
+ benchmark = 'tao',
25
+ public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
26
+ data_preprocessor=dict(
27
+ type='TrackDataPreprocessor',
28
+ # Image normalization parameters
29
+ mean=[123.675, 116.28, 103.53],
30
+ std=[58.395, 57.12, 57.375],
31
+ bgr_to_rgb=True,
32
+ # Image padding parameters
33
+ pad_mask=False, # In instance segmentation, the mask needs to be padded
34
+ pad_size_divisor=1024, # Padding the image to multiples of 1024
35
+ ),
36
+ detector=detector,
37
+ masa_adapter=[
38
+ dict(
39
+ type='FPN',
40
+ in_channels=[256, 512, 1024],
41
+ out_channels=256,
42
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
43
+ num_outs=5),
44
+ dict(
45
+ type='DeformFusion',
46
+ in_channels=256,
47
+ out_channels=256,
48
+ num_blocks=3)],
49
+ rpn_head=dict(
50
+ type='RPNHead',
51
+ in_channels=256,
52
+ feat_channels=256,
53
+ anchor_generator=dict(
54
+ type='AnchorGenerator',
55
+ scales=[8],
56
+ ratios=[0.5, 1.0, 2.0],
57
+ strides=[8, 16, 32, 64, 128]),
58
+ bbox_coder=dict(
59
+ type='DeltaXYWHBBoxCoder',
60
+ target_means=[.0, .0, .0, .0],
61
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
62
+ loss_cls=dict(
63
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
64
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
65
+ ),
66
+ roi_head=dict(
67
+ type='StandardRoIHead',
68
+ bbox_roi_extractor=dict(
69
+ type='SingleRoIExtractor',
70
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
71
+ out_channels=256,
72
+ featmap_strides=[8, 16, 32]),
73
+ bbox_head=dict(
74
+ type='Shared2FCBBoxHead',
75
+ in_channels=256,
76
+ fc_out_channels=1024,
77
+ roi_feat_size=7,
78
+ num_classes=1,
79
+ bbox_coder=dict(
80
+ type='DeltaXYWHBBoxCoder',
81
+ target_means=[0., 0., 0., 0.],
82
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
83
+ reg_class_agnostic=True,
84
+ loss_cls=dict(
85
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
86
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
87
+ # model training and testing settings
88
+ train_cfg=dict(
89
+ rpn=dict(
90
+ assigner=dict(
91
+ type='MaxIoUAssigner',
92
+ pos_iou_thr=0.7,
93
+ neg_iou_thr=0.3,
94
+ min_pos_iou=0.3,
95
+ match_low_quality=True,
96
+ ignore_iof_thr=-1),
97
+ sampler=dict(
98
+ type='RandomSampler',
99
+ num=256,
100
+ pos_fraction=0.5,
101
+ neg_pos_ub=-1,
102
+ add_gt_as_proposals=False),
103
+ allowed_border=-1,
104
+ pos_weight=-1,
105
+ debug=False),
106
+ rpn_proposal=dict(
107
+ nms_pre=2000,
108
+ max_per_img=1000,
109
+ nms=dict(type='nms', iou_threshold=0.7),
110
+ min_bbox_size=0),
111
+ rcnn=dict(
112
+ assigner=dict(
113
+ type='MaxIoUAssigner',
114
+ pos_iou_thr=0.5,
115
+ neg_iou_thr=0.5,
116
+ min_pos_iou=0.5,
117
+ match_low_quality=False,
118
+ ignore_iof_thr=-1),
119
+ sampler=dict(
120
+ type='RandomSampler',
121
+ num=512,
122
+ pos_fraction=0.25,
123
+ neg_pos_ub=-1,
124
+ add_gt_as_proposals=True),
125
+ pos_weight=-1,
126
+ debug=False)),
127
+ test_cfg=dict(
128
+ rpn=dict(
129
+ nms_pre=1000,
130
+ max_per_img=1000,
131
+ nms=dict(type='nms', iou_threshold=0.7),
132
+ min_bbox_size=0),
133
+ rcnn=dict(
134
+ score_thr=0.02,
135
+ # nms=dict(type='nms', iou_threshold=0.5),
136
+ nms=dict(type='nms',
137
+ iou_threshold=0.5,
138
+ class_agnostic=True,
139
+ split_thr=100000),
140
+ max_per_img=50,
141
+ mask_thr_binary=0.5)
142
+ # soft-nms is also supported for rcnn testing
143
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
144
+ ),
145
+ track_head=dict(
146
+ type='MasaTrackHead',
147
+ roi_extractor=dict(
148
+ type='SingleRoIExtractor',
149
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
150
+ out_channels=256,
151
+ featmap_strides=[8, 16, 32]),
152
+ embed_head=dict(
153
+ type='QuasiDenseEmbedHead',
154
+ num_convs=4,
155
+ num_fcs=1,
156
+ embed_channels=256,
157
+ norm_cfg=dict(type='GN', num_groups=32),
158
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
159
+ loss_track_aux=dict(
160
+ type='MarginL2Loss',
161
+ neg_pos_ub=3,
162
+ pos_margin=0,
163
+ neg_margin=0.1,
164
+ hard_mining=True,
165
+ loss_weight=1.0)),
166
+ train_cfg=dict(
167
+ assigner=dict(
168
+ type='MaxIoUAssigner',
169
+ pos_iou_thr=0.7,
170
+ neg_iou_thr=0.3,
171
+ min_pos_iou=0.5,
172
+ match_low_quality=False,
173
+ ignore_iof_thr=-1),
174
+ sampler=dict(
175
+ type='CombinedSampler',
176
+ num=512,
177
+ pos_fraction=0.5,
178
+ neg_pos_ub=3,
179
+ add_gt_as_proposals=True,
180
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
181
+ neg_sampler=dict(type='RandomSampler')))),
182
+ tracker=dict(
183
+ type='MasaTaoTracker',
184
+ init_score_thr=0.0001,
185
+ obj_score_thr=0.0001,
186
+ match_score_thr=0.5,
187
+ memo_tracklet_frames=10,
188
+ memo_momentum=0.8,
189
+ with_cats=False,
190
+ max_distance=-1,
191
+ fps=1,
192
+ )
193
+ )
194
+
195
+ test_pipeline = [
196
+ dict(
197
+ type='TransformBroadcaster',
198
+ transforms=[
199
+ dict(type='LoadImageFromFile'),
200
+ dict(
201
+ type='Resize',
202
+ scale=(1024, 1024),
203
+ keep_ratio=True),
204
+ dict(type='LoadTrackAnnotations')
205
+ ]),
206
+ dict(type='PackTrackInputs')
207
+ ]
208
+
209
+ # runtime settings
210
+ train_dataloader = None
211
+ train_cfg = None
212
+ val_cfg = dict(type='ValLoop')
213
+ test_cfg = dict(type='TestLoop')
214
+
215
+ default_hooks = dict(
216
+ logger=dict(type='LoggerHook', interval=50),
217
+ visualization=dict(type='TrackVisualizationHook', draw=False),
218
+ checkpoint = dict(type='CheckpointHook', interval=1),
219
+ )
220
+
221
+ vis_backends = [dict(type='LocalVisBackend')]
222
+ visualizer = dict(
223
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
224
+
225
+ val_dataloader = dict(
226
+ dataset=dict(
227
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
228
+ pipeline=test_pipeline,
229
+ )
230
+ )
231
+ test_dataloader = val_dataloader
232
+ test_evaluator = dict(
233
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
234
+ outfile_prefix='results/masa_results/masa-groundingdino-release-ovmot-test',
235
+ open_vocabulary=True,
236
+ )
configs/masa-gdino/tao_teta_test/masa_gdino_swinb_tao_test_detic_dets.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../../../projects/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata_masa.py',
3
+ '../../datasets/tao/tao_dataset_v1.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ # detector.backbone.update(dict(out_indices=(1, 2, 3)))
9
+ detector.pop('data_preprocessor')
10
+ detector['init_cfg'] = dict(
11
+ type='Pretrained',
12
+ checkpoint= 'saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'
13
+ # noqa: E501
14
+ )
15
+ detector['type'] = 'GroundingDINOMasa'
16
+
17
+ del _base_.model
18
+
19
+ model = dict(
20
+ type='MASA',
21
+ freeze_detector=True,
22
+ unified_backbone=True,
23
+ load_public_dets = True,
24
+ benchmark = 'tao',
25
+ public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
26
+ data_preprocessor=dict(
27
+ type='TrackDataPreprocessor',
28
+ # Image normalization parameters
29
+ mean=[123.675, 116.28, 103.53],
30
+ std=[58.395, 57.12, 57.375],
31
+ bgr_to_rgb=True,
32
+ # Image padding parameters
33
+ pad_mask=False, # In instance segmentation, the mask needs to be padded
34
+ pad_size_divisor=1024, # Padding the image to multiples of 1024
35
+ ),
36
+ detector=detector,
37
+ masa_adapter=[
38
+ dict(
39
+ type='FPN',
40
+ in_channels=[256, 512, 1024],
41
+ out_channels=256,
42
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
43
+ num_outs=5),
44
+ dict(
45
+ type='DeformFusion',
46
+ in_channels=256,
47
+ out_channels=256,
48
+ num_blocks=3)],
49
+ rpn_head=dict(
50
+ type='RPNHead',
51
+ in_channels=256,
52
+ feat_channels=256,
53
+ anchor_generator=dict(
54
+ type='AnchorGenerator',
55
+ scales=[8],
56
+ ratios=[0.5, 1.0, 2.0],
57
+ strides=[8, 16, 32, 64, 128]),
58
+ bbox_coder=dict(
59
+ type='DeltaXYWHBBoxCoder',
60
+ target_means=[.0, .0, .0, .0],
61
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
62
+ loss_cls=dict(
63
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
64
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
65
+ ),
66
+ roi_head=dict(
67
+ type='StandardRoIHead',
68
+ bbox_roi_extractor=dict(
69
+ type='SingleRoIExtractor',
70
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
71
+ out_channels=256,
72
+ featmap_strides=[8, 16, 32]),
73
+ bbox_head=dict(
74
+ type='Shared2FCBBoxHead',
75
+ in_channels=256,
76
+ fc_out_channels=1024,
77
+ roi_feat_size=7,
78
+ num_classes=1,
79
+ bbox_coder=dict(
80
+ type='DeltaXYWHBBoxCoder',
81
+ target_means=[0., 0., 0., 0.],
82
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
83
+ reg_class_agnostic=True,
84
+ loss_cls=dict(
85
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
86
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
87
+ # model training and testing settings
88
+ train_cfg=dict(
89
+ rpn=dict(
90
+ assigner=dict(
91
+ type='MaxIoUAssigner',
92
+ pos_iou_thr=0.7,
93
+ neg_iou_thr=0.3,
94
+ min_pos_iou=0.3,
95
+ match_low_quality=True,
96
+ ignore_iof_thr=-1),
97
+ sampler=dict(
98
+ type='RandomSampler',
99
+ num=256,
100
+ pos_fraction=0.5,
101
+ neg_pos_ub=-1,
102
+ add_gt_as_proposals=False),
103
+ allowed_border=-1,
104
+ pos_weight=-1,
105
+ debug=False),
106
+ rpn_proposal=dict(
107
+ nms_pre=2000,
108
+ max_per_img=1000,
109
+ nms=dict(type='nms', iou_threshold=0.7),
110
+ min_bbox_size=0),
111
+ rcnn=dict(
112
+ assigner=dict(
113
+ type='MaxIoUAssigner',
114
+ pos_iou_thr=0.5,
115
+ neg_iou_thr=0.5,
116
+ min_pos_iou=0.5,
117
+ match_low_quality=False,
118
+ ignore_iof_thr=-1),
119
+ sampler=dict(
120
+ type='RandomSampler',
121
+ num=512,
122
+ pos_fraction=0.25,
123
+ neg_pos_ub=-1,
124
+ add_gt_as_proposals=True),
125
+ pos_weight=-1,
126
+ debug=False)),
127
+ test_cfg=dict(
128
+ rpn=dict(
129
+ nms_pre=1000,
130
+ max_per_img=1000,
131
+ nms=dict(type='nms', iou_threshold=0.7),
132
+ min_bbox_size=0),
133
+ rcnn=dict(
134
+ score_thr=0.02,
135
+ # nms=dict(type='nms', iou_threshold=0.5),
136
+ nms=dict(type='nms',
137
+ iou_threshold=0.5,
138
+ class_agnostic=True,
139
+ split_thr=100000),
140
+ max_per_img=50,
141
+ mask_thr_binary=0.5)
142
+ # soft-nms is also supported for rcnn testing
143
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
144
+ ),
145
+ track_head=dict(
146
+ type='MasaTrackHead',
147
+ roi_extractor=dict(
148
+ type='SingleRoIExtractor',
149
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
150
+ out_channels=256,
151
+ featmap_strides=[8, 16, 32]),
152
+ embed_head=dict(
153
+ type='QuasiDenseEmbedHead',
154
+ num_convs=4,
155
+ num_fcs=1,
156
+ embed_channels=256,
157
+ norm_cfg=dict(type='GN', num_groups=32),
158
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
159
+ loss_track_aux=dict(
160
+ type='MarginL2Loss',
161
+ neg_pos_ub=3,
162
+ pos_margin=0,
163
+ neg_margin=0.1,
164
+ hard_mining=True,
165
+ loss_weight=1.0)),
166
+ train_cfg=dict(
167
+ assigner=dict(
168
+ type='MaxIoUAssigner',
169
+ pos_iou_thr=0.7,
170
+ neg_iou_thr=0.3,
171
+ min_pos_iou=0.5,
172
+ match_low_quality=False,
173
+ ignore_iof_thr=-1),
174
+ sampler=dict(
175
+ type='CombinedSampler',
176
+ num=512,
177
+ pos_fraction=0.5,
178
+ neg_pos_ub=3,
179
+ add_gt_as_proposals=True,
180
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
181
+ neg_sampler=dict(type='RandomSampler')))),
182
+ tracker=dict(
183
+ type='MasaTaoTracker',
184
+ init_score_thr=0.0001,
185
+ obj_score_thr=0.0001,
186
+ match_score_thr=0.5,
187
+ memo_tracklet_frames=10,
188
+ memo_momentum=0.8,
189
+ with_cats=False,
190
+ max_distance=-1,
191
+ fps=1,
192
+ )
193
+ )
194
+
195
+ test_pipeline = [
196
+ dict(
197
+ type='TransformBroadcaster',
198
+ transforms=[
199
+ dict(type='LoadImageFromFile'),
200
+ dict(
201
+ type='Resize',
202
+ scale=(1024, 1024),
203
+ keep_ratio=True),
204
+ dict(type='LoadTrackAnnotations')
205
+ ]),
206
+ dict(type='PackTrackInputs')
207
+ ]
208
+
209
+ # runtime settings
210
+ train_dataloader = None
211
+ train_cfg = None
212
+ val_cfg = dict(type='ValLoop')
213
+ test_cfg = dict(type='TestLoop')
214
+
215
+ default_hooks = dict(
216
+ logger=dict(type='LoggerHook', interval=50),
217
+ visualization=dict(type='TrackVisualizationHook', draw=False),
218
+ checkpoint = dict(type='CheckpointHook', interval=1),
219
+ )
220
+
221
+ vis_backends = [dict(type='LocalVisBackend')]
222
+ visualizer = dict(
223
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
224
+
225
+ val_dataloader = dict(
226
+ dataset=dict(
227
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
228
+ pipeline=test_pipeline,
229
+ )
230
+ )
231
+ test_dataloader = val_dataloader
232
+ test_evaluator = dict(
233
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
234
+ outfile_prefix='results/masa_results/masa-groundingdino-release_detic_dets-test',
235
+ )
configs/masa-gdino/tao_teta_test/masa_gdino_swinb_tao_test_teter_swinT_dets.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../../../projects/grounding_dino/grounding_dino_swin-b_pretrain_mixeddata_masa.py',
3
+ '../../datasets/tao/tao_dataset_v05.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ # detector.backbone.update(dict(out_indices=(1, 2, 3)))
9
+ detector.pop('data_preprocessor')
10
+ detector['init_cfg'] = dict(
11
+ type='Pretrained',
12
+ checkpoint= 'saved_models/tsa_models/groundingdino_swinb_cogcoor_mmdet-55949c9c.pth'
13
+ # noqa: E501
14
+ )
15
+ detector['type'] = 'GroundingDINOMasa'
16
+
17
+ del _base_.model
18
+
19
+ model = dict(
20
+ type='MASA',
21
+ freeze_detector=True,
22
+ unified_backbone=True,
23
+ load_public_dets = True,
24
+ public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/teter_swinT_tao_val_internms_50/',
25
+ data_preprocessor=dict(
26
+ type='TrackDataPreprocessor',
27
+ # Image normalization parameters
28
+ mean=[123.675, 116.28, 103.53],
29
+ std=[58.395, 57.12, 57.375],
30
+ bgr_to_rgb=True,
31
+ # Image padding parameters
32
+ pad_mask=False, # In instance segmentation, the mask needs to be padded
33
+ pad_size_divisor=1024, # Padding the image to multiples of 1024
34
+ ),
35
+ detector=detector,
36
+ masa_adapter=[
37
+ dict(
38
+ type='FPN',
39
+ in_channels=[256, 512, 1024],
40
+ out_channels=256,
41
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
42
+ num_outs=5),
43
+ dict(
44
+ type='DeformFusion',
45
+ in_channels=256,
46
+ out_channels=256,
47
+ num_blocks=3)],
48
+ rpn_head=dict(
49
+ type='RPNHead',
50
+ in_channels=256,
51
+ feat_channels=256,
52
+ anchor_generator=dict(
53
+ type='AnchorGenerator',
54
+ scales=[8],
55
+ ratios=[0.5, 1.0, 2.0],
56
+ strides=[8, 16, 32, 64, 128]),
57
+ bbox_coder=dict(
58
+ type='DeltaXYWHBBoxCoder',
59
+ target_means=[.0, .0, .0, .0],
60
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
61
+ loss_cls=dict(
62
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
63
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
64
+ ),
65
+ roi_head=dict(
66
+ type='StandardRoIHead',
67
+ bbox_roi_extractor=dict(
68
+ type='SingleRoIExtractor',
69
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
70
+ out_channels=256,
71
+ featmap_strides=[8, 16, 32]),
72
+ bbox_head=dict(
73
+ type='Shared2FCBBoxHead',
74
+ in_channels=256,
75
+ fc_out_channels=1024,
76
+ roi_feat_size=7,
77
+ num_classes=1,
78
+ bbox_coder=dict(
79
+ type='DeltaXYWHBBoxCoder',
80
+ target_means=[0., 0., 0., 0.],
81
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
82
+ reg_class_agnostic=True,
83
+ loss_cls=dict(
84
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
85
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
86
+ # model training and testing settings
87
+ train_cfg=dict(
88
+ rpn=dict(
89
+ assigner=dict(
90
+ type='MaxIoUAssigner',
91
+ pos_iou_thr=0.7,
92
+ neg_iou_thr=0.3,
93
+ min_pos_iou=0.3,
94
+ match_low_quality=True,
95
+ ignore_iof_thr=-1),
96
+ sampler=dict(
97
+ type='RandomSampler',
98
+ num=256,
99
+ pos_fraction=0.5,
100
+ neg_pos_ub=-1,
101
+ add_gt_as_proposals=False),
102
+ allowed_border=-1,
103
+ pos_weight=-1,
104
+ debug=False),
105
+ rpn_proposal=dict(
106
+ nms_pre=2000,
107
+ max_per_img=1000,
108
+ nms=dict(type='nms', iou_threshold=0.7),
109
+ min_bbox_size=0),
110
+ rcnn=dict(
111
+ assigner=dict(
112
+ type='MaxIoUAssigner',
113
+ pos_iou_thr=0.5,
114
+ neg_iou_thr=0.5,
115
+ min_pos_iou=0.5,
116
+ match_low_quality=False,
117
+ ignore_iof_thr=-1),
118
+ sampler=dict(
119
+ type='RandomSampler',
120
+ num=512,
121
+ pos_fraction=0.25,
122
+ neg_pos_ub=-1,
123
+ add_gt_as_proposals=True),
124
+ pos_weight=-1,
125
+ debug=False)),
126
+ test_cfg=dict(
127
+ rpn=dict(
128
+ nms_pre=1000,
129
+ max_per_img=1000,
130
+ nms=dict(type='nms', iou_threshold=0.7),
131
+ min_bbox_size=0),
132
+ rcnn=dict(
133
+ score_thr=0.02,
134
+ # nms=dict(type='nms', iou_threshold=0.5),
135
+ nms=dict(type='nms',
136
+ iou_threshold=0.5,
137
+ class_agnostic=True,
138
+ split_thr=100000),
139
+ max_per_img=50,
140
+ mask_thr_binary=0.5)
141
+ # soft-nms is also supported for rcnn testing
142
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
143
+ ),
144
+ track_head=dict(
145
+ type='MasaTrackHead',
146
+ roi_extractor=dict(
147
+ type='SingleRoIExtractor',
148
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
149
+ out_channels=256,
150
+ featmap_strides=[8, 16, 32]),
151
+ embed_head=dict(
152
+ type='QuasiDenseEmbedHead',
153
+ num_convs=4,
154
+ num_fcs=1,
155
+ embed_channels=256,
156
+ norm_cfg=dict(type='GN', num_groups=32),
157
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
158
+ loss_track_aux=dict(
159
+ type='MarginL2Loss',
160
+ neg_pos_ub=3,
161
+ pos_margin=0,
162
+ neg_margin=0.1,
163
+ hard_mining=True,
164
+ loss_weight=1.0)),
165
+ train_cfg=dict(
166
+ assigner=dict(
167
+ type='MaxIoUAssigner',
168
+ pos_iou_thr=0.7,
169
+ neg_iou_thr=0.3,
170
+ min_pos_iou=0.5,
171
+ match_low_quality=False,
172
+ ignore_iof_thr=-1),
173
+ sampler=dict(
174
+ type='CombinedSampler',
175
+ num=512,
176
+ pos_fraction=0.5,
177
+ neg_pos_ub=3,
178
+ add_gt_as_proposals=True,
179
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
180
+ neg_sampler=dict(type='RandomSampler')))),
181
+ tracker=dict(
182
+ type='MasaTaoTracker',
183
+ init_score_thr=0.0001,
184
+ obj_score_thr=0.0001,
185
+ match_score_thr=0.5,
186
+ memo_tracklet_frames=10,
187
+ memo_momentum=0.8,
188
+ with_cats=False,
189
+ max_distance=-1,
190
+ fps=1,
191
+ )
192
+ )
193
+
194
+ test_pipeline = [
195
+ dict(
196
+ type='TransformBroadcaster',
197
+ transforms=[
198
+ dict(type='LoadImageFromFile'),
199
+ dict(
200
+ type='Resize',
201
+ scale=(1024, 1024),
202
+ keep_ratio=True),
203
+ dict(type='LoadTrackAnnotations')
204
+ ]),
205
+ dict(type='PackTrackInputs')
206
+ ]
207
+
208
+
209
+ train_dataloader = None
210
+ train_cfg = None
211
+ val_cfg = dict(type='ValLoop')
212
+ test_cfg = dict(type='TestLoop')
213
+
214
+ default_hooks = dict(
215
+ logger=dict(type='LoggerHook', interval=50),
216
+ visualization=dict(type='TrackVisualizationHook', draw=False))
217
+
218
+ vis_backends = [dict(type='LocalVisBackend')]
219
+ visualizer = dict(
220
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
221
+
222
+ # custom hooks
223
+ custom_hooks = [
224
+ # Synchronize model buffers such as running_mean and running_var in BN
225
+ # at the end of each epoch
226
+ dict(type='SyncBuffersHook')
227
+ ]
228
+ auto_scale_lr = dict(enable=False, base_batch_size=16)
229
+ val_dataloader = dict(
230
+ dataset=dict(
231
+ ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
232
+ pipeline=test_pipeline,
233
+ )
234
+ )
235
+ test_dataloader = val_dataloader
236
+ val_evaluator = dict(
237
+ ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
238
+ outfile_prefix='results/masa_results/masa-groundingdino-release-tao-teter-test',
239
+ )
240
+ test_evaluator = val_evaluator
configs/masa-one/bdd_test/masa_r50_bdd_mot_test.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../../default_runtime.py',
3
+ '../../datasets/bdd/bdd_dataset.py',
4
+ ]
5
+ default_scope = 'mmdet'
6
+
7
+ model = dict(
8
+ type='MASA',
9
+ unified_backbone=False,
10
+ load_public_dets = True,
11
+ use_masa_backbone = True,
12
+ benchmark='bdd',
13
+ public_det_path='results/public_dets/bdd_mot_yolox_dets/',
14
+ data_preprocessor=dict(
15
+ type='TrackDataPreprocessor',
16
+ # Image normalization parameters
17
+ mean=[123.675, 116.28, 103.53],
18
+ std=[58.395, 57.12, 57.375],
19
+ bgr_to_rgb=True,
20
+ # Image padding parameters
21
+ pad_mask=True, # In instance segmentation, the mask needs to be padded
22
+ pad_size_divisor=32), # Padding the image to multiples of 32
23
+ backbone=dict(
24
+ type='ResNet',
25
+ depth=50,
26
+ num_stages=4,
27
+ out_indices=(0, 1, 2, 3),
28
+ frozen_stages=-1,
29
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
30
+ norm_eval=True,
31
+ style='caffe',),
32
+ masa_adapter=[
33
+ dict(
34
+ type='FPN',
35
+ in_channels=[256, 512, 1024, 2048],
36
+ out_channels=256,
37
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
38
+ num_outs=5),
39
+ dict(
40
+ type='DeformFusion',
41
+ in_channels=256,
42
+ out_channels=256,
43
+ num_blocks=3)],
44
+ rpn_head=dict(
45
+ type='RPNHead',
46
+ in_channels=256,
47
+ feat_channels=256,
48
+ anchor_generator=dict(
49
+ type='AnchorGenerator',
50
+ scales=[8],
51
+ ratios=[0.5, 1.0, 2.0],
52
+ strides=[4, 8, 16, 32, 64]),
53
+ bbox_coder=dict(
54
+ type='DeltaXYWHBBoxCoder',
55
+ target_means=[.0, .0, .0, .0],
56
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
57
+ loss_cls=dict(
58
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
59
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
60
+ roi_head=dict(
61
+ type='StandardRoIHead',
62
+ bbox_roi_extractor=dict(
63
+ type='SingleRoIExtractor',
64
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
65
+ out_channels=256,
66
+ featmap_strides=[4, 8, 16, 32]),
67
+ bbox_head=dict(
68
+ type='Shared4Conv1FCBBoxHead',
69
+ in_channels=256,
70
+ fc_out_channels=1024,
71
+ roi_feat_size=7,
72
+ num_classes=1,
73
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
74
+ bbox_coder=dict(
75
+ type='DeltaXYWHBBoxCoder',
76
+ target_means=[0., 0., 0., 0.],
77
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
78
+ reg_class_agnostic=True,
79
+ loss_cls=dict(
80
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
81
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
82
+ # model training and testing settings
83
+ train_cfg=dict(
84
+ rpn=dict(
85
+ assigner=dict(
86
+ type='MaxIoUAssigner',
87
+ pos_iou_thr=0.7,
88
+ neg_iou_thr=0.3,
89
+ min_pos_iou=0.3,
90
+ match_low_quality=True,
91
+ ignore_iof_thr=-1),
92
+ sampler=dict(
93
+ type='RandomSampler',
94
+ num=256,
95
+ pos_fraction=0.5,
96
+ neg_pos_ub=-1,
97
+ add_gt_as_proposals=False),
98
+ allowed_border=-1,
99
+ pos_weight=-1,
100
+ debug=False),
101
+ rpn_proposal=dict(
102
+ nms_pre=2000,
103
+ max_per_img=1000,
104
+ nms=dict(type='nms', iou_threshold=0.7),
105
+ min_bbox_size=0),
106
+ rcnn=dict(
107
+ assigner=dict(
108
+ type='MaxIoUAssigner',
109
+ pos_iou_thr=0.5,
110
+ neg_iou_thr=0.5,
111
+ min_pos_iou=0.5,
112
+ match_low_quality=False,
113
+ ignore_iof_thr=-1),
114
+ sampler=dict(
115
+ type='RandomSampler',
116
+ num=512,
117
+ pos_fraction=0.25,
118
+ neg_pos_ub=-1,
119
+ add_gt_as_proposals=True),
120
+ pos_weight=-1,
121
+ debug=False)),
122
+ test_cfg=dict(
123
+ rpn=dict(
124
+ nms_pre=1000,
125
+ max_per_img=1000,
126
+ nms=dict(type='nms', iou_threshold=0.7),
127
+ min_bbox_size=0),
128
+ rcnn=dict(
129
+ score_thr=0.02,
130
+ # nms=dict(type='nms', iou_threshold=0.5),
131
+ nms=dict(type='nms',
132
+ iou_threshold=0.5,
133
+ class_agnostic=True,
134
+ split_thr=100000),
135
+ max_per_img=50,
136
+ mask_thr_binary=0.5)
137
+ # soft-nms is also supported for rcnn testing
138
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
139
+ ),
140
+ track_head=dict(
141
+ type='MasaTrackHead',
142
+ roi_extractor=dict(
143
+ type='SingleRoIExtractor',
144
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
145
+ out_channels=256,
146
+ featmap_strides=[4, 8, 16, 32]),
147
+ embed_head=dict(
148
+ type='QuasiDenseEmbedHead',
149
+ num_convs=4,
150
+ num_fcs=1,
151
+ embed_channels=256,
152
+ norm_cfg=dict(type='GN', num_groups=32),
153
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
154
+ loss_track_aux=dict(
155
+ type='MarginL2Loss',
156
+ neg_pos_ub=3,
157
+ pos_margin=0,
158
+ neg_margin=0.1,
159
+ hard_mining=True,
160
+ loss_weight=1.0)),
161
+ train_cfg=dict(
162
+ assigner=dict(
163
+ type='MaxIoUAssigner',
164
+ pos_iou_thr=0.7,
165
+ neg_iou_thr=0.3,
166
+ min_pos_iou=0.5,
167
+ match_low_quality=False,
168
+ ignore_iof_thr=-1),
169
+ sampler=dict(
170
+ type='CombinedSampler',
171
+ num=512,
172
+ pos_fraction=0.5,
173
+ neg_pos_ub=3,
174
+ add_gt_as_proposals=True,
175
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
176
+ neg_sampler=dict(type='RandomSampler')))),
177
+ tracker=dict(
178
+ type='MasaBDDTracker',
179
+ init_score_thr=0.5,
180
+ obj_score_thr=0.3,
181
+ match_score_thr=0.6,
182
+ memo_tracklet_frames=10,
183
+ memo_backdrop_frames=1,
184
+ memo_momentum=0.8,
185
+ nms_conf_thr=0.5,
186
+ nms_backdrop_iou_thr=0.3,
187
+ nms_class_iou_thr=0.7,
188
+ with_cats=False,
189
+ match_metric='bisoftmax')
190
+ )
191
+
192
+ test_pipeline = [
193
+ dict(
194
+ type='TransformBroadcaster',
195
+ transforms=[
196
+ dict(type='LoadImageFromFile'),
197
+ dict(
198
+ type='Resize',
199
+ scale=(1024, 1024),
200
+ keep_ratio=True),
201
+ dict(type='LoadTrackAnnotations')
202
+ ]),
203
+ dict(type='PackTrackInputs')
204
+ ]
205
+
206
+ # runtime settings
207
+ train_dataloader = None
208
+ train_cfg = None
209
+ val_cfg = dict(type='ValLoop')
210
+ test_cfg = dict(type='TestLoop')
211
+
212
+ default_hooks = dict(
213
+ logger=dict(type='LoggerHook', interval=50),
214
+ visualization=dict(type='TrackVisualizationHook', draw=False),
215
+ checkpoint = dict(type='CheckpointHook', interval=1),
216
+ )
217
+
218
+ vis_backends = [dict(type='LocalVisBackend')]
219
+ visualizer = dict(
220
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
221
+
222
+ val_dataloader = dict(
223
+ dataset=dict(
224
+ ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
225
+ pipeline=test_pipeline,
226
+ )
227
+ )
228
+ test_dataloader = val_dataloader
229
+ val_evaluator = dict(
230
+ ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
231
+ scalabel_gt='data/bdd/annotations/scalabel_gt/box_track_20/val/',
232
+ outfile_prefix='results/masa_results/masa-r50-release-bdd-mot-test',
233
+ metric=['TETA', 'HOTA', 'CLEAR']
234
+ )
235
+ test_evaluator = val_evaluator
configs/masa-one/bdd_test/masa_r50_bdd_mots_test.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../../default_runtime.py',
3
+ '../../datasets/bdd/bdd_dataset.py',
4
+ ]
5
+ default_scope = 'mmdet'
6
+
7
+ model = dict(
8
+ type='MASA',
9
+ unified_backbone=False,
10
+ load_public_dets = True,
11
+ use_masa_backbone = True,
12
+ benchmark='bdd',
13
+ with_segm=True,
14
+ public_det_path = 'results/public_dets/bdd_mots_val_uninext_dets/',
15
+ data_preprocessor=dict(
16
+ type='TrackDataPreprocessor',
17
+ # Image normalization parameters
18
+ mean=[123.675, 116.28, 103.53],
19
+ std=[58.395, 57.12, 57.375],
20
+ bgr_to_rgb=True,
21
+ # Image padding parameters
22
+ pad_mask=True, # In instance segmentation, the mask needs to be padded
23
+ pad_size_divisor=32), # Padding the image to multiples of 32
24
+ backbone=dict(
25
+ type='ResNet',
26
+ depth=50,
27
+ num_stages=4,
28
+ out_indices=(0, 1, 2, 3),
29
+ frozen_stages=-1,
30
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
31
+ norm_eval=True,
32
+ style='caffe',),
33
+ masa_adapter=[
34
+ dict(
35
+ type='FPN',
36
+ in_channels=[256, 512, 1024, 2048],
37
+ out_channels=256,
38
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
39
+ num_outs=5),
40
+ dict(
41
+ type='DeformFusion',
42
+ in_channels=256,
43
+ out_channels=256,
44
+ num_blocks=3)],
45
+ rpn_head=dict(
46
+ type='RPNHead',
47
+ in_channels=256,
48
+ feat_channels=256,
49
+ anchor_generator=dict(
50
+ type='AnchorGenerator',
51
+ scales=[8],
52
+ ratios=[0.5, 1.0, 2.0],
53
+ strides=[4, 8, 16, 32, 64]),
54
+ bbox_coder=dict(
55
+ type='DeltaXYWHBBoxCoder',
56
+ target_means=[.0, .0, .0, .0],
57
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
58
+ loss_cls=dict(
59
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
60
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
61
+ roi_head=dict(
62
+ type='StandardRoIHead',
63
+ bbox_roi_extractor=dict(
64
+ type='SingleRoIExtractor',
65
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
66
+ out_channels=256,
67
+ featmap_strides=[4, 8, 16, 32]),
68
+ bbox_head=dict(
69
+ type='Shared4Conv1FCBBoxHead',
70
+ in_channels=256,
71
+ fc_out_channels=1024,
72
+ roi_feat_size=7,
73
+ num_classes=1,
74
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
75
+ bbox_coder=dict(
76
+ type='DeltaXYWHBBoxCoder',
77
+ target_means=[0., 0., 0., 0.],
78
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
79
+ reg_class_agnostic=True,
80
+ loss_cls=dict(
81
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
82
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
83
+ # model training and testing settings
84
+ train_cfg=dict(
85
+ rpn=dict(
86
+ assigner=dict(
87
+ type='MaxIoUAssigner',
88
+ pos_iou_thr=0.7,
89
+ neg_iou_thr=0.3,
90
+ min_pos_iou=0.3,
91
+ match_low_quality=True,
92
+ ignore_iof_thr=-1),
93
+ sampler=dict(
94
+ type='RandomSampler',
95
+ num=256,
96
+ pos_fraction=0.5,
97
+ neg_pos_ub=-1,
98
+ add_gt_as_proposals=False),
99
+ allowed_border=-1,
100
+ pos_weight=-1,
101
+ debug=False),
102
+ rpn_proposal=dict(
103
+ nms_pre=2000,
104
+ max_per_img=1000,
105
+ nms=dict(type='nms', iou_threshold=0.7),
106
+ min_bbox_size=0),
107
+ rcnn=dict(
108
+ assigner=dict(
109
+ type='MaxIoUAssigner',
110
+ pos_iou_thr=0.5,
111
+ neg_iou_thr=0.5,
112
+ min_pos_iou=0.5,
113
+ match_low_quality=False,
114
+ ignore_iof_thr=-1),
115
+ sampler=dict(
116
+ type='RandomSampler',
117
+ num=512,
118
+ pos_fraction=0.25,
119
+ neg_pos_ub=-1,
120
+ add_gt_as_proposals=True),
121
+ pos_weight=-1,
122
+ debug=False)),
123
+ test_cfg=dict(
124
+ rpn=dict(
125
+ nms_pre=1000,
126
+ max_per_img=1000,
127
+ nms=dict(type='nms', iou_threshold=0.7),
128
+ min_bbox_size=0),
129
+ rcnn=dict(
130
+ score_thr=0.02,
131
+ # nms=dict(type='nms', iou_threshold=0.5),
132
+ nms=dict(type='nms',
133
+ iou_threshold=0.5,
134
+ class_agnostic=True,
135
+ split_thr=100000),
136
+ max_per_img=50,
137
+ mask_thr_binary=0.5)
138
+ # soft-nms is also supported for rcnn testing
139
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
140
+ ),
141
+ track_head=dict(
142
+ type='MasaTrackHead',
143
+ roi_extractor=dict(
144
+ type='SingleRoIExtractor',
145
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
146
+ out_channels=256,
147
+ featmap_strides=[4, 8, 16, 32]),
148
+ embed_head=dict(
149
+ type='QuasiDenseEmbedHead',
150
+ num_convs=4,
151
+ num_fcs=1,
152
+ embed_channels=256,
153
+ norm_cfg=dict(type='GN', num_groups=32),
154
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
155
+ loss_track_aux=dict(
156
+ type='MarginL2Loss',
157
+ neg_pos_ub=3,
158
+ pos_margin=0,
159
+ neg_margin=0.1,
160
+ hard_mining=True,
161
+ loss_weight=1.0)),
162
+ train_cfg=dict(
163
+ assigner=dict(
164
+ type='MaxIoUAssigner',
165
+ pos_iou_thr=0.7,
166
+ neg_iou_thr=0.3,
167
+ min_pos_iou=0.5,
168
+ match_low_quality=False,
169
+ ignore_iof_thr=-1),
170
+ sampler=dict(
171
+ type='CombinedSampler',
172
+ num=512,
173
+ pos_fraction=0.5,
174
+ neg_pos_ub=3,
175
+ add_gt_as_proposals=True,
176
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
177
+ neg_sampler=dict(type='RandomSampler')))),
178
+ tracker=dict(
179
+ type='MasaBDDTracker',
180
+ init_score_thr=0.5,
181
+ obj_score_thr=0.3,
182
+ match_score_thr=0.6,
183
+ memo_tracklet_frames=10,
184
+ memo_backdrop_frames=1,
185
+ memo_momentum=0.8,
186
+ nms_conf_thr=0.5,
187
+ nms_backdrop_iou_thr=0.3,
188
+ nms_class_iou_thr=0.7,
189
+ with_cats=False,
190
+ match_metric='bisoftmax')
191
+ )
192
+
193
+ test_pipeline = [
194
+ dict(
195
+ type='TransformBroadcaster',
196
+ transforms=[
197
+ dict(type='LoadImageFromFile'),
198
+ dict(
199
+ type='Resize',
200
+ scale=(1024, 1024),
201
+ keep_ratio=True),
202
+ dict(type='LoadTrackAnnotations')
203
+ ]),
204
+ dict(type='PackTrackInputs')
205
+ ]
206
+
207
+ # runtime settings
208
+ train_dataloader = None
209
+ train_cfg = None
210
+ val_cfg = dict(type='ValLoop')
211
+ test_cfg = dict(type='TestLoop')
212
+
213
+ default_hooks = dict(
214
+ logger=dict(type='LoggerHook', interval=50),
215
+ visualization=dict(type='TrackVisualizationHook', draw=False),
216
+ checkpoint = dict(type='CheckpointHook', interval=1),
217
+ )
218
+
219
+ vis_backends = [dict(type='LocalVisBackend')]
220
+ visualizer = dict(
221
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
222
+
223
+ val_dataloader = dict(
224
+ dataset=dict(
225
+ ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
226
+ pipeline=test_pipeline,
227
+ )
228
+ )
229
+
230
+ test_dataloader = val_dataloader
231
+ val_evaluator = dict(
232
+ ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
233
+ scalabel_gt='data/bdd/annotations/scalabel_gt/seg_track_20/val/',
234
+ outfile_prefix='results/masa_results/masa-r50-release-bdd-mots-test',
235
+ metric=['TETA', 'HOTA', 'CLEAR'],
236
+ with_mask=True,
237
+ )
238
+ test_evaluator = val_evaluator
configs/masa-one/masa_r50_plug_and_play.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../default_runtime.py'
3
+ ]
4
+ default_scope = 'mmdet'
5
+
6
+ model = dict(
7
+ type='MASA',
8
+ unified_backbone=False,
9
+ load_public_dets = False,
10
+ use_masa_backbone = True,
11
+ given_dets = True,
12
+ data_preprocessor=dict(
13
+ type='TrackDataPreprocessor',
14
+ # Image normalization parameters
15
+ mean=[123.675, 116.28, 103.53],
16
+ std=[58.395, 57.12, 57.375],
17
+ bgr_to_rgb=True,
18
+ # Image padding parameters
19
+ pad_mask=True, # In instance segmentation, the mask needs to be padded
20
+ pad_size_divisor=32), # Padding the image to multiples of 32
21
+ # detector=detector,
22
+ backbone=dict(
23
+ type='ResNet',
24
+ depth=50,
25
+ num_stages=4,
26
+ out_indices=(0, 1, 2, 3),
27
+ frozen_stages=-1,
28
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
29
+ norm_eval=True,
30
+ style='caffe',),
31
+ masa_adapter=[
32
+ dict(
33
+ type='FPN',
34
+ in_channels=[256, 512, 1024, 2048],
35
+ out_channels=256,
36
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
37
+ num_outs=5),
38
+ dict(
39
+ type='DeformFusion',
40
+ in_channels=256,
41
+ out_channels=256,
42
+ num_blocks=3)],
43
+ rpn_head=dict(
44
+ type='RPNHead',
45
+ in_channels=256,
46
+ feat_channels=256,
47
+ anchor_generator=dict(
48
+ type='AnchorGenerator',
49
+ scales=[8],
50
+ ratios=[0.5, 1.0, 2.0],
51
+ strides=[4, 8, 16, 32, 64]),
52
+ bbox_coder=dict(
53
+ type='DeltaXYWHBBoxCoder',
54
+ target_means=[.0, .0, .0, .0],
55
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
56
+ loss_cls=dict(
57
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
58
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
59
+ roi_head=dict(
60
+ type='StandardRoIHead',
61
+ bbox_roi_extractor=dict(
62
+ type='SingleRoIExtractor',
63
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
64
+ out_channels=256,
65
+ featmap_strides=[4, 8, 16, 32]),
66
+ bbox_head=dict(
67
+ type='Shared4Conv1FCBBoxHead',
68
+ in_channels=256,
69
+ fc_out_channels=1024,
70
+ roi_feat_size=7,
71
+ num_classes=1,
72
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
73
+ bbox_coder=dict(
74
+ type='DeltaXYWHBBoxCoder',
75
+ target_means=[0., 0., 0., 0.],
76
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
77
+ reg_class_agnostic=True,
78
+ loss_cls=dict(
79
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
80
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
81
+ # model training and testing settings
82
+ train_cfg=dict(
83
+ rpn=dict(
84
+ assigner=dict(
85
+ type='MaxIoUAssigner',
86
+ pos_iou_thr=0.7,
87
+ neg_iou_thr=0.3,
88
+ min_pos_iou=0.3,
89
+ match_low_quality=True,
90
+ ignore_iof_thr=-1),
91
+ sampler=dict(
92
+ type='RandomSampler',
93
+ num=256,
94
+ pos_fraction=0.5,
95
+ neg_pos_ub=-1,
96
+ add_gt_as_proposals=False),
97
+ allowed_border=-1,
98
+ pos_weight=-1,
99
+ debug=False),
100
+ rpn_proposal=dict(
101
+ nms_pre=2000,
102
+ max_per_img=1000,
103
+ nms=dict(type='nms', iou_threshold=0.7),
104
+ min_bbox_size=0),
105
+ rcnn=dict(
106
+ assigner=dict(
107
+ type='MaxIoUAssigner',
108
+ pos_iou_thr=0.5,
109
+ neg_iou_thr=0.5,
110
+ min_pos_iou=0.5,
111
+ match_low_quality=False,
112
+ ignore_iof_thr=-1),
113
+ sampler=dict(
114
+ type='RandomSampler',
115
+ num=512,
116
+ pos_fraction=0.25,
117
+ neg_pos_ub=-1,
118
+ add_gt_as_proposals=True),
119
+ pos_weight=-1,
120
+ debug=False)),
121
+ test_cfg=dict(
122
+ rpn=dict(
123
+ nms_pre=1000,
124
+ max_per_img=1000,
125
+ nms=dict(type='nms', iou_threshold=0.7),
126
+ min_bbox_size=0),
127
+ rcnn=dict(
128
+ score_thr=0.02,
129
+ # nms=dict(type='nms', iou_threshold=0.5),
130
+ nms=dict(type='nms',
131
+ iou_threshold=0.5,
132
+ class_agnostic=True,
133
+ split_thr=100000),
134
+ max_per_img=50,
135
+ mask_thr_binary=0.5)
136
+ # soft-nms is also supported for rcnn testing
137
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
138
+ ),
139
+ track_head=dict(
140
+ type='QuasiDenseTrackHead',
141
+ roi_extractor=dict(
142
+ type='SingleRoIExtractor',
143
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
144
+ out_channels=256,
145
+ featmap_strides=[4, 8, 16, 32]),
146
+ embed_head=dict(
147
+ type='QuasiDenseEmbedHead',
148
+ num_convs=4,
149
+ num_fcs=1,
150
+ embed_channels=256,
151
+ norm_cfg=dict(type='GN', num_groups=32),
152
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
153
+ loss_track_aux=dict(
154
+ type='MarginL2Loss',
155
+ neg_pos_ub=3,
156
+ pos_margin=0,
157
+ neg_margin=0.1,
158
+ hard_mining=True,
159
+ loss_weight=1.0)),
160
+ train_cfg=dict(
161
+ assigner=dict(
162
+ type='MaxIoUAssigner',
163
+ pos_iou_thr=0.7,
164
+ neg_iou_thr=0.3,
165
+ min_pos_iou=0.5,
166
+ match_low_quality=False,
167
+ ignore_iof_thr=-1),
168
+ sampler=dict(
169
+ type='CombinedSampler',
170
+ num=512,
171
+ pos_fraction=0.5,
172
+ neg_pos_ub=3,
173
+ add_gt_as_proposals=True,
174
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
175
+ neg_sampler=dict(type='RandomSampler')))),
176
+ tracker=dict(
177
+ type='MasaTaoTracker',
178
+ init_score_thr=0.1,
179
+ obj_score_thr=0.01,
180
+ match_score_thr=0.5,
181
+ memo_tracklet_frames=10,
182
+ memo_momentum=0.8,
183
+ with_cats=False,
184
+ max_distance=100,
185
+ fps=30,
186
+ )
187
+ )
188
+
189
+ inference_pipeline = [
190
+ dict(
191
+ type='TransformBroadcaster',
192
+ transforms=[
193
+ dict(
194
+ type='Resize',
195
+ scale=(1024, 1024),
196
+ keep_ratio=True),
197
+ ]),
198
+ dict(type='PackTrackInputs')
199
+ ]
200
+
201
+ # runtime settings
202
+ train_cfg = None
203
+ val_cfg = dict(type='ValLoop')
204
+ test_cfg = dict(type='TestLoop')
205
+
206
+ default_hooks = dict(
207
+ logger=dict(type='LoggerHook', interval=50),
208
+ visualization=dict(type='TrackVisualizationHook', draw=False),
209
+ checkpoint=dict(type='CheckpointHook', interval=12),
210
+ )
211
+
212
+ vis_backends = [dict(type='LocalVisBackend')]
213
+ visualizer = dict(
214
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
configs/masa-one/open_vocabulary_mot_test/masa_r50_open_vocabulary_test.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../../default_runtime.py',
3
+ '../../datasets/tao/tao_dataset_v1.py',
4
+ ]
5
+ default_scope = 'mmdet'
6
+
7
+ model = dict(
8
+ type='MASA',
9
+ unified_backbone=False,
10
+ load_public_dets = True,
11
+ use_masa_backbone = True,
12
+ benchmark = 'tao',
13
+ public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
14
+ data_preprocessor=dict(
15
+ type='TrackDataPreprocessor',
16
+ # Image normalization parameters
17
+ mean=[123.675, 116.28, 103.53],
18
+ std=[58.395, 57.12, 57.375],
19
+ bgr_to_rgb=True,
20
+ # Image padding parameters
21
+ pad_mask=True, # In instance segmentation, the mask needs to be padded
22
+ pad_size_divisor=32), # Padding the image to multiples of 32
23
+ backbone=dict(
24
+ type='ResNet',
25
+ depth=50,
26
+ num_stages=4,
27
+ out_indices=(0, 1, 2, 3),
28
+ frozen_stages=-1,
29
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
30
+ norm_eval=True,
31
+ style='caffe',),
32
+ masa_adapter=[
33
+ dict(
34
+ type='FPN',
35
+ in_channels=[256, 512, 1024, 2048],
36
+ out_channels=256,
37
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
38
+ num_outs=5),
39
+ dict(
40
+ type='DeformFusion',
41
+ in_channels=256,
42
+ out_channels=256,
43
+ num_blocks=3)],
44
+ rpn_head=dict(
45
+ type='RPNHead',
46
+ in_channels=256,
47
+ feat_channels=256,
48
+ anchor_generator=dict(
49
+ type='AnchorGenerator',
50
+ scales=[8],
51
+ ratios=[0.5, 1.0, 2.0],
52
+ strides=[4, 8, 16, 32, 64]),
53
+ bbox_coder=dict(
54
+ type='DeltaXYWHBBoxCoder',
55
+ target_means=[.0, .0, .0, .0],
56
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
57
+ loss_cls=dict(
58
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
59
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
60
+ roi_head=dict(
61
+ type='StandardRoIHead',
62
+ bbox_roi_extractor=dict(
63
+ type='SingleRoIExtractor',
64
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
65
+ out_channels=256,
66
+ featmap_strides=[4, 8, 16, 32]),
67
+ bbox_head=dict(
68
+ type='Shared4Conv1FCBBoxHead',
69
+ in_channels=256,
70
+ fc_out_channels=1024,
71
+ roi_feat_size=7,
72
+ num_classes=1,
73
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
74
+ bbox_coder=dict(
75
+ type='DeltaXYWHBBoxCoder',
76
+ target_means=[0., 0., 0., 0.],
77
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
78
+ reg_class_agnostic=True,
79
+ loss_cls=dict(
80
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
81
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
82
+ # model training and testing settings
83
+ train_cfg=dict(
84
+ rpn=dict(
85
+ assigner=dict(
86
+ type='MaxIoUAssigner',
87
+ pos_iou_thr=0.7,
88
+ neg_iou_thr=0.3,
89
+ min_pos_iou=0.3,
90
+ match_low_quality=True,
91
+ ignore_iof_thr=-1),
92
+ sampler=dict(
93
+ type='RandomSampler',
94
+ num=256,
95
+ pos_fraction=0.5,
96
+ neg_pos_ub=-1,
97
+ add_gt_as_proposals=False),
98
+ allowed_border=-1,
99
+ pos_weight=-1,
100
+ debug=False),
101
+ rpn_proposal=dict(
102
+ nms_pre=2000,
103
+ max_per_img=1000,
104
+ nms=dict(type='nms', iou_threshold=0.7),
105
+ min_bbox_size=0),
106
+ rcnn=dict(
107
+ assigner=dict(
108
+ type='MaxIoUAssigner',
109
+ pos_iou_thr=0.5,
110
+ neg_iou_thr=0.5,
111
+ min_pos_iou=0.5,
112
+ match_low_quality=False,
113
+ ignore_iof_thr=-1),
114
+ sampler=dict(
115
+ type='RandomSampler',
116
+ num=512,
117
+ pos_fraction=0.25,
118
+ neg_pos_ub=-1,
119
+ add_gt_as_proposals=True),
120
+ pos_weight=-1,
121
+ debug=False)),
122
+ test_cfg=dict(
123
+ rpn=dict(
124
+ nms_pre=1000,
125
+ max_per_img=1000,
126
+ nms=dict(type='nms', iou_threshold=0.7),
127
+ min_bbox_size=0),
128
+ rcnn=dict(
129
+ score_thr=0.02,
130
+ # nms=dict(type='nms', iou_threshold=0.5),
131
+ nms=dict(type='nms',
132
+ iou_threshold=0.5,
133
+ class_agnostic=True,
134
+ split_thr=100000),
135
+ max_per_img=50,
136
+ mask_thr_binary=0.5)
137
+ # soft-nms is also supported for rcnn testing
138
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
139
+ ),
140
+ track_head=dict(
141
+ type='MasaTrackHead',
142
+ roi_extractor=dict(
143
+ type='SingleRoIExtractor',
144
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
145
+ out_channels=256,
146
+ featmap_strides=[4, 8, 16, 32]),
147
+ embed_head=dict(
148
+ type='QuasiDenseEmbedHead',
149
+ num_convs=4,
150
+ num_fcs=1,
151
+ embed_channels=256,
152
+ norm_cfg=dict(type='GN', num_groups=32),
153
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
154
+ loss_track_aux=dict(
155
+ type='MarginL2Loss',
156
+ neg_pos_ub=3,
157
+ pos_margin=0,
158
+ neg_margin=0.1,
159
+ hard_mining=True,
160
+ loss_weight=1.0)),
161
+ train_cfg=dict(
162
+ assigner=dict(
163
+ type='MaxIoUAssigner',
164
+ pos_iou_thr=0.7,
165
+ neg_iou_thr=0.3,
166
+ min_pos_iou=0.5,
167
+ match_low_quality=False,
168
+ ignore_iof_thr=-1),
169
+ sampler=dict(
170
+ type='CombinedSampler',
171
+ num=512,
172
+ pos_fraction=0.5,
173
+ neg_pos_ub=3,
174
+ add_gt_as_proposals=True,
175
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
176
+ neg_sampler=dict(type='RandomSampler')))),
177
+ tracker=dict(
178
+ type='MasaTaoTracker',
179
+ init_score_thr=0.0001,
180
+ obj_score_thr=0.0001,
181
+ match_score_thr=0.5,
182
+ memo_tracklet_frames=10,
183
+ memo_momentum=0.8,
184
+ with_cats=False,
185
+ max_distance=-1,
186
+ fps=1,
187
+ )
188
+ )
189
+
190
+ test_pipeline = [
191
+ dict(
192
+ type='TransformBroadcaster',
193
+ transforms=[
194
+ dict(type='LoadImageFromFile'),
195
+ dict(
196
+ type='Resize',
197
+ scale=(1024, 1024),
198
+ keep_ratio=True),
199
+ dict(type='LoadTrackAnnotations')
200
+ ]),
201
+ dict(type='PackTrackInputs')
202
+ ]
203
+
204
+ # runtime settings
205
+ train_dataloader = None
206
+ train_cfg = None
207
+ val_cfg = dict(type='ValLoop')
208
+ test_cfg = dict(type='TestLoop')
209
+
210
+ default_hooks = dict(
211
+ logger=dict(type='LoggerHook', interval=50),
212
+ visualization=dict(type='TrackVisualizationHook', draw=False),
213
+ checkpoint = dict(type='CheckpointHook', interval=1),
214
+ )
215
+
216
+ vis_backends = [dict(type='LocalVisBackend')]
217
+ visualizer = dict(
218
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
219
+
220
+ val_dataloader = dict(
221
+ dataset=dict(
222
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
223
+ pipeline=test_pipeline,
224
+ )
225
+ )
226
+ test_dataloader = val_dataloader
227
+ test_evaluator = dict(
228
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
229
+ outfile_prefix='results/masa_results/masa-r50-release-ovmot-test',
230
+ open_vocabulary=True,
231
+ )
configs/masa-one/tao_teta_test/masa_r50_tao_test_detic_dets.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../../default_runtime.py',
3
+ '../../datasets/tao/tao_dataset_v1.py',
4
+ ]
5
+ default_scope = 'mmdet'
6
+
7
+ model = dict(
8
+ type='MASA',
9
+ unified_backbone=False,
10
+ load_public_dets = True,
11
+ use_masa_backbone = True,
12
+ benchmark = 'tao',
13
+ public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
14
+ data_preprocessor=dict(
15
+ type='TrackDataPreprocessor',
16
+ # Image normalization parameters
17
+ mean=[123.675, 116.28, 103.53],
18
+ std=[58.395, 57.12, 57.375],
19
+ bgr_to_rgb=True,
20
+ # Image padding parameters
21
+ pad_mask=True, # In instance segmentation, the mask needs to be padded
22
+ pad_size_divisor=32), # Padding the image to multiples of 32
23
+ backbone=dict(
24
+ type='ResNet',
25
+ depth=50,
26
+ num_stages=4,
27
+ out_indices=(0, 1, 2, 3),
28
+ frozen_stages=-1,
29
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
30
+ norm_eval=True,
31
+ style='caffe',),
32
+ masa_adapter=[
33
+ dict(
34
+ type='FPN',
35
+ in_channels=[256, 512, 1024, 2048],
36
+ out_channels=256,
37
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
38
+ num_outs=5),
39
+ dict(
40
+ type='DeformFusion',
41
+ in_channels=256,
42
+ out_channels=256,
43
+ num_blocks=3)],
44
+ rpn_head=dict(
45
+ type='RPNHead',
46
+ in_channels=256,
47
+ feat_channels=256,
48
+ anchor_generator=dict(
49
+ type='AnchorGenerator',
50
+ scales=[8],
51
+ ratios=[0.5, 1.0, 2.0],
52
+ strides=[4, 8, 16, 32, 64]),
53
+ bbox_coder=dict(
54
+ type='DeltaXYWHBBoxCoder',
55
+ target_means=[.0, .0, .0, .0],
56
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
57
+ loss_cls=dict(
58
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
59
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
60
+ roi_head=dict(
61
+ type='StandardRoIHead',
62
+ bbox_roi_extractor=dict(
63
+ type='SingleRoIExtractor',
64
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
65
+ out_channels=256,
66
+ featmap_strides=[4, 8, 16, 32]),
67
+ bbox_head=dict(
68
+ type='Shared4Conv1FCBBoxHead',
69
+ in_channels=256,
70
+ fc_out_channels=1024,
71
+ roi_feat_size=7,
72
+ num_classes=1,
73
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
74
+ bbox_coder=dict(
75
+ type='DeltaXYWHBBoxCoder',
76
+ target_means=[0., 0., 0., 0.],
77
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
78
+ reg_class_agnostic=True,
79
+ loss_cls=dict(
80
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
81
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
82
+ # model training and testing settings
83
+ train_cfg=dict(
84
+ rpn=dict(
85
+ assigner=dict(
86
+ type='MaxIoUAssigner',
87
+ pos_iou_thr=0.7,
88
+ neg_iou_thr=0.3,
89
+ min_pos_iou=0.3,
90
+ match_low_quality=True,
91
+ ignore_iof_thr=-1),
92
+ sampler=dict(
93
+ type='RandomSampler',
94
+ num=256,
95
+ pos_fraction=0.5,
96
+ neg_pos_ub=-1,
97
+ add_gt_as_proposals=False),
98
+ allowed_border=-1,
99
+ pos_weight=-1,
100
+ debug=False),
101
+ rpn_proposal=dict(
102
+ nms_pre=2000,
103
+ max_per_img=1000,
104
+ nms=dict(type='nms', iou_threshold=0.7),
105
+ min_bbox_size=0),
106
+ rcnn=dict(
107
+ assigner=dict(
108
+ type='MaxIoUAssigner',
109
+ pos_iou_thr=0.5,
110
+ neg_iou_thr=0.5,
111
+ min_pos_iou=0.5,
112
+ match_low_quality=False,
113
+ ignore_iof_thr=-1),
114
+ sampler=dict(
115
+ type='RandomSampler',
116
+ num=512,
117
+ pos_fraction=0.25,
118
+ neg_pos_ub=-1,
119
+ add_gt_as_proposals=True),
120
+ pos_weight=-1,
121
+ debug=False)),
122
+ test_cfg=dict(
123
+ rpn=dict(
124
+ nms_pre=1000,
125
+ max_per_img=1000,
126
+ nms=dict(type='nms', iou_threshold=0.7),
127
+ min_bbox_size=0),
128
+ rcnn=dict(
129
+ score_thr=0.02,
130
+ # nms=dict(type='nms', iou_threshold=0.5),
131
+ nms=dict(type='nms',
132
+ iou_threshold=0.5,
133
+ class_agnostic=True,
134
+ split_thr=100000),
135
+ max_per_img=50,
136
+ mask_thr_binary=0.5)
137
+ # soft-nms is also supported for rcnn testing
138
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
139
+ ),
140
+ track_head=dict(
141
+ type='MasaTrackHead',
142
+ roi_extractor=dict(
143
+ type='SingleRoIExtractor',
144
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
145
+ out_channels=256,
146
+ featmap_strides=[4, 8, 16, 32]),
147
+ embed_head=dict(
148
+ type='QuasiDenseEmbedHead',
149
+ num_convs=4,
150
+ num_fcs=1,
151
+ embed_channels=256,
152
+ norm_cfg=dict(type='GN', num_groups=32),
153
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
154
+ loss_track_aux=dict(
155
+ type='MarginL2Loss',
156
+ neg_pos_ub=3,
157
+ pos_margin=0,
158
+ neg_margin=0.1,
159
+ hard_mining=True,
160
+ loss_weight=1.0)),
161
+ train_cfg=dict(
162
+ assigner=dict(
163
+ type='MaxIoUAssigner',
164
+ pos_iou_thr=0.7,
165
+ neg_iou_thr=0.3,
166
+ min_pos_iou=0.5,
167
+ match_low_quality=False,
168
+ ignore_iof_thr=-1),
169
+ sampler=dict(
170
+ type='CombinedSampler',
171
+ num=512,
172
+ pos_fraction=0.5,
173
+ neg_pos_ub=3,
174
+ add_gt_as_proposals=True,
175
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
176
+ neg_sampler=dict(type='RandomSampler')))),
177
+ tracker=dict(
178
+ type='MasaTaoTracker',
179
+ init_score_thr=0.0001,
180
+ obj_score_thr=0.0001,
181
+ match_score_thr=0.5,
182
+ memo_tracklet_frames=10,
183
+ memo_momentum=0.8,
184
+ with_cats=False,
185
+ max_distance=-1,
186
+ fps=1,
187
+ )
188
+ )
189
+
190
+ test_pipeline = [
191
+ dict(
192
+ type='TransformBroadcaster',
193
+ transforms=[
194
+ dict(type='LoadImageFromFile'),
195
+ dict(
196
+ type='Resize',
197
+ scale=(1024, 1024),
198
+ keep_ratio=True),
199
+ dict(type='LoadTrackAnnotations')
200
+ ]),
201
+ dict(type='PackTrackInputs')
202
+ ]
203
+
204
+ # runtime settings
205
+ train_dataloader = None
206
+ train_cfg = None
207
+ val_cfg = dict(type='ValLoop')
208
+ test_cfg = dict(type='TestLoop')
209
+
210
+ default_hooks = dict(
211
+ logger=dict(type='LoggerHook', interval=50),
212
+ visualization=dict(type='TrackVisualizationHook', draw=False),
213
+ checkpoint = dict(type='CheckpointHook', interval=1),
214
+ )
215
+
216
+ vis_backends = [dict(type='LocalVisBackend')]
217
+ visualizer = dict(
218
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
219
+
220
+ val_dataloader = dict(
221
+ dataset=dict(
222
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
223
+ pipeline=test_pipeline,
224
+ )
225
+ )
226
+ test_dataloader = val_dataloader
227
+ test_evaluator = dict(
228
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
229
+ outfile_prefix='results/masa_results/masa-r50-release_detic_dets-test',
230
+ )
configs/masa-one/tao_teta_test/masa_r50_tao_test_teter_swinT_dets.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../../default_runtime.py',
3
+ '../../datasets/tao/tao_dataset_v05.py',
4
+ ]
5
+ default_scope = 'mmdet'
6
+
7
+ model = dict(
8
+ type='MASA',
9
+ unified_backbone=False,
10
+ load_public_dets = True,
11
+ use_masa_backbone = True,
12
+ benchmark = 'tao',
13
+ public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/teter_swinT_tao_val_internms_50/',
14
+ data_preprocessor=dict(
15
+ type='TrackDataPreprocessor',
16
+ # Image normalization parameters
17
+ mean=[123.675, 116.28, 103.53],
18
+ std=[58.395, 57.12, 57.375],
19
+ bgr_to_rgb=True,
20
+ # Image padding parameters
21
+ pad_mask=True, # In instance segmentation, the mask needs to be padded
22
+ pad_size_divisor=32), # Padding the image to multiples of 32
23
+ backbone=dict(
24
+ type='ResNet',
25
+ depth=50,
26
+ num_stages=4,
27
+ out_indices=(0, 1, 2, 3),
28
+ frozen_stages=-1,
29
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
30
+ norm_eval=True,
31
+ style='caffe',),
32
+ masa_adapter=[
33
+ dict(
34
+ type='FPN',
35
+ in_channels=[256, 512, 1024, 2048],
36
+ out_channels=256,
37
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
38
+ num_outs=5),
39
+ dict(
40
+ type='DeformFusion',
41
+ in_channels=256,
42
+ out_channels=256,
43
+ num_blocks=3)],
44
+ rpn_head=dict(
45
+ type='RPNHead',
46
+ in_channels=256,
47
+ feat_channels=256,
48
+ anchor_generator=dict(
49
+ type='AnchorGenerator',
50
+ scales=[8],
51
+ ratios=[0.5, 1.0, 2.0],
52
+ strides=[4, 8, 16, 32, 64]),
53
+ bbox_coder=dict(
54
+ type='DeltaXYWHBBoxCoder',
55
+ target_means=[.0, .0, .0, .0],
56
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
57
+ loss_cls=dict(
58
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
59
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
60
+ roi_head=dict(
61
+ type='StandardRoIHead',
62
+ bbox_roi_extractor=dict(
63
+ type='SingleRoIExtractor',
64
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
65
+ out_channels=256,
66
+ featmap_strides=[4, 8, 16, 32]),
67
+ bbox_head=dict(
68
+ type='Shared4Conv1FCBBoxHead',
69
+ in_channels=256,
70
+ fc_out_channels=1024,
71
+ roi_feat_size=7,
72
+ num_classes=1,
73
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
74
+ bbox_coder=dict(
75
+ type='DeltaXYWHBBoxCoder',
76
+ target_means=[0., 0., 0., 0.],
77
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
78
+ reg_class_agnostic=True,
79
+ loss_cls=dict(
80
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
81
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
82
+ # model training and testing settings
83
+ train_cfg=dict(
84
+ rpn=dict(
85
+ assigner=dict(
86
+ type='MaxIoUAssigner',
87
+ pos_iou_thr=0.7,
88
+ neg_iou_thr=0.3,
89
+ min_pos_iou=0.3,
90
+ match_low_quality=True,
91
+ ignore_iof_thr=-1),
92
+ sampler=dict(
93
+ type='RandomSampler',
94
+ num=256,
95
+ pos_fraction=0.5,
96
+ neg_pos_ub=-1,
97
+ add_gt_as_proposals=False),
98
+ allowed_border=-1,
99
+ pos_weight=-1,
100
+ debug=False),
101
+ rpn_proposal=dict(
102
+ nms_pre=2000,
103
+ max_per_img=1000,
104
+ nms=dict(type='nms', iou_threshold=0.7),
105
+ min_bbox_size=0),
106
+ rcnn=dict(
107
+ assigner=dict(
108
+ type='MaxIoUAssigner',
109
+ pos_iou_thr=0.5,
110
+ neg_iou_thr=0.5,
111
+ min_pos_iou=0.5,
112
+ match_low_quality=False,
113
+ ignore_iof_thr=-1),
114
+ sampler=dict(
115
+ type='RandomSampler',
116
+ num=512,
117
+ pos_fraction=0.25,
118
+ neg_pos_ub=-1,
119
+ add_gt_as_proposals=True),
120
+ pos_weight=-1,
121
+ debug=False)),
122
+ test_cfg=dict(
123
+ rpn=dict(
124
+ nms_pre=1000,
125
+ max_per_img=1000,
126
+ nms=dict(type='nms', iou_threshold=0.7),
127
+ min_bbox_size=0),
128
+ rcnn=dict(
129
+ score_thr=0.02,
130
+ # nms=dict(type='nms', iou_threshold=0.5),
131
+ nms=dict(type='nms',
132
+ iou_threshold=0.5,
133
+ class_agnostic=True,
134
+ split_thr=100000),
135
+ max_per_img=50,
136
+ mask_thr_binary=0.5)
137
+ # soft-nms is also supported for rcnn testing
138
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
139
+ ),
140
+ track_head=dict(
141
+ type='MasaTrackHead',
142
+ roi_extractor=dict(
143
+ type='SingleRoIExtractor',
144
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
145
+ out_channels=256,
146
+ featmap_strides=[4, 8, 16, 32]),
147
+ embed_head=dict(
148
+ type='QuasiDenseEmbedHead',
149
+ num_convs=4,
150
+ num_fcs=1,
151
+ embed_channels=256,
152
+ norm_cfg=dict(type='GN', num_groups=32),
153
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
154
+ loss_track_aux=dict(
155
+ type='MarginL2Loss',
156
+ neg_pos_ub=3,
157
+ pos_margin=0,
158
+ neg_margin=0.1,
159
+ hard_mining=True,
160
+ loss_weight=1.0)),
161
+ train_cfg=dict(
162
+ assigner=dict(
163
+ type='MaxIoUAssigner',
164
+ pos_iou_thr=0.7,
165
+ neg_iou_thr=0.3,
166
+ min_pos_iou=0.5,
167
+ match_low_quality=False,
168
+ ignore_iof_thr=-1),
169
+ sampler=dict(
170
+ type='CombinedSampler',
171
+ num=512,
172
+ pos_fraction=0.5,
173
+ neg_pos_ub=3,
174
+ add_gt_as_proposals=True,
175
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
176
+ neg_sampler=dict(type='RandomSampler')))),
177
+ tracker=dict(
178
+ type='MasaTaoTracker',
179
+ init_score_thr=0.0001,
180
+ obj_score_thr=0.0001,
181
+ match_score_thr=0.5,
182
+ memo_tracklet_frames=10,
183
+ memo_momentum=0.8,
184
+ with_cats=False,
185
+ max_distance=-1,
186
+ fps=1,
187
+ )
188
+ )
189
+
190
+ test_pipeline = [
191
+ dict(
192
+ type='TransformBroadcaster',
193
+ transforms=[
194
+ dict(type='LoadImageFromFile'),
195
+ dict(
196
+ type='Resize',
197
+ scale=(1024, 1024),
198
+ keep_ratio=True),
199
+ dict(type='LoadTrackAnnotations')
200
+ ]),
201
+ dict(type='PackTrackInputs')
202
+ ]
203
+
204
+ # runtime settings
205
+ train_dataloader = None
206
+ train_cfg = None
207
+ val_cfg = dict(type='ValLoop')
208
+ test_cfg = dict(type='TestLoop')
209
+
210
+ default_hooks = dict(
211
+ logger=dict(type='LoggerHook', interval=50),
212
+ visualization=dict(type='TrackVisualizationHook', draw=False),
213
+ checkpoint = dict(type='CheckpointHook', interval=1),
214
+ )
215
+
216
+ vis_backends = [dict(type='LocalVisBackend')]
217
+ visualizer = dict(
218
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
219
+
220
+ val_dataloader = dict(
221
+ dataset=dict(
222
+ ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
223
+ pipeline=test_pipeline,
224
+ )
225
+ )
226
+ test_dataloader = val_dataloader
227
+ test_evaluator = dict(
228
+ ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
229
+ outfile_prefix='results/masa_results/masa-r50-release-tao-teter-test',
230
+ )
configs/masa-sam/bdd_test/masa_sam_vitb_bdd_mot_test.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../sam-vitb.py',
3
+ '../../datasets/bdd/bdd_dataset.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ detector['init_cfg'] = dict(
9
+ type='Pretrained',
10
+ checkpoint= 'saved_models/pretrain_weights/sam_vit_b_01ec64_mmdet.pth'
11
+ # noqa: E501
12
+ )
13
+ detector['type'] = 'SamMasa'
14
+
15
+ del _base_.model
16
+
17
+ model = dict(
18
+ type='MASA',
19
+ freeze_detector=True,
20
+ unified_backbone=True,
21
+ load_public_dets = True,
22
+ benchmark = 'bdd',
23
+ public_det_path = 'results/public_dets/bdd_mot_yolox_dets/',
24
+ data_preprocessor=dict(
25
+ type='TrackDataPreprocessor',
26
+ # Image normalization parameters
27
+ mean=[123.675, 116.28, 103.53],
28
+ std=[58.395, 57.12, 57.375],
29
+ bgr_to_rgb=True,
30
+ # Image padding parameters
31
+ pad_mask=False, # In instance segmentation, the mask needs to be padded
32
+ pad_size_divisor=1024), # Padding the image to multiples of 1024
33
+ detector=detector,
34
+ masa_adapter=[
35
+ dict(
36
+ type='SimpleFPN',
37
+ in_channels=[768, 768, 768, 768],
38
+ out_channels=256,
39
+ use_residual=True,
40
+ num_outs=5),
41
+ dict(
42
+ type='DyHead',
43
+ in_channels=256,
44
+ out_channels=256,
45
+ num_blocks=3)
46
+ ],
47
+ rpn_head=dict(
48
+ type='RPNHead',
49
+ in_channels=256,
50
+ feat_channels=256,
51
+ anchor_generator=dict(
52
+ type='AnchorGenerator',
53
+ scales=[8],
54
+ ratios=[0.5, 1.0, 2.0],
55
+ strides=[4, 8, 16, 32, 64]),
56
+ bbox_coder=dict(
57
+ type='DeltaXYWHBBoxCoder',
58
+ target_means=[.0, .0, .0, .0],
59
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
60
+ loss_cls=dict(
61
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
62
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
63
+ ),
64
+ roi_head=dict(
65
+ type='StandardRoIHead',
66
+ bbox_roi_extractor=dict(
67
+ type='SingleRoIExtractor',
68
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
69
+ out_channels=256,
70
+ featmap_strides=[4, 8, 16, 32]),
71
+ bbox_head=dict(
72
+ type='Shared2FCBBoxHead',
73
+ in_channels=256,
74
+ fc_out_channels=1024,
75
+ roi_feat_size=7,
76
+ num_classes=1,
77
+ bbox_coder=dict(
78
+ type='DeltaXYWHBBoxCoder',
79
+ target_means=[0., 0., 0., 0.],
80
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
81
+ reg_class_agnostic=True,
82
+ loss_cls=dict(
83
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
84
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
85
+ # model training and testing settings
86
+ train_cfg=dict(
87
+ rpn=dict(
88
+ assigner=dict(
89
+ type='MaxIoUAssigner',
90
+ pos_iou_thr=0.7,
91
+ neg_iou_thr=0.3,
92
+ min_pos_iou=0.3,
93
+ match_low_quality=True,
94
+ ignore_iof_thr=-1),
95
+ sampler=dict(
96
+ type='RandomSampler',
97
+ num=256,
98
+ pos_fraction=0.5,
99
+ neg_pos_ub=-1,
100
+ add_gt_as_proposals=False),
101
+ allowed_border=-1,
102
+ pos_weight=-1,
103
+ debug=False),
104
+ rpn_proposal=dict(
105
+ nms_pre=2000,
106
+ max_per_img=1000,
107
+ nms=dict(type='nms', iou_threshold=0.7),
108
+ min_bbox_size=0),
109
+ rcnn=dict(
110
+ assigner=dict(
111
+ type='MaxIoUAssigner',
112
+ pos_iou_thr=0.5,
113
+ neg_iou_thr=0.5,
114
+ min_pos_iou=0.5,
115
+ match_low_quality=False,
116
+ ignore_iof_thr=-1),
117
+ sampler=dict(
118
+ type='RandomSampler',
119
+ num=512,
120
+ pos_fraction=0.25,
121
+ neg_pos_ub=-1,
122
+ add_gt_as_proposals=True),
123
+ pos_weight=-1,
124
+ debug=False)),
125
+ test_cfg=dict(
126
+ rpn=dict(
127
+ nms_pre=1000,
128
+ max_per_img=1000,
129
+ nms=dict(type='nms', iou_threshold=0.7),
130
+ min_bbox_size=0),
131
+ rcnn=dict(
132
+ score_thr=0.02,
133
+ # nms=dict(type='nms', iou_threshold=0.5),
134
+ nms=dict(type='nms',
135
+ iou_threshold=0.5,
136
+ class_agnostic=True,
137
+ split_thr=100000),
138
+ max_per_img=50,
139
+ mask_thr_binary=0.5)
140
+ # soft-nms is also supported for rcnn testing
141
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
142
+ ),
143
+ track_head=dict(
144
+ type='MasaTrackHead',
145
+ roi_extractor=dict(
146
+ type='SingleRoIExtractor',
147
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
148
+ out_channels=256,
149
+ featmap_strides=[4, 8, 16, 32]),
150
+ embed_head=dict(
151
+ type='QuasiDenseEmbedHead',
152
+ num_convs=4,
153
+ num_fcs=1,
154
+ embed_channels=256,
155
+ norm_cfg=dict(type='GN', num_groups=32),
156
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
157
+ loss_track_aux=dict(
158
+ type='MarginL2Loss',
159
+ neg_pos_ub=3,
160
+ pos_margin=0,
161
+ neg_margin=0.1,
162
+ hard_mining=True,
163
+ loss_weight=1.0)),
164
+ train_cfg=dict(
165
+ assigner=dict(
166
+ type='MaxIoUAssigner',
167
+ pos_iou_thr=0.7,
168
+ neg_iou_thr=0.3,
169
+ min_pos_iou=0.5,
170
+ match_low_quality=False,
171
+ ignore_iof_thr=-1),
172
+ sampler=dict(
173
+ type='CombinedSampler',
174
+ num=512,
175
+ pos_fraction=0.5,
176
+ neg_pos_ub=3,
177
+ add_gt_as_proposals=True,
178
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
179
+ neg_sampler=dict(type='RandomSampler')))),
180
+ tracker=dict(
181
+ type='MasaBDDTracker',
182
+ init_score_thr=0.5,
183
+ obj_score_thr=0.3,
184
+ match_score_thr=0.6,
185
+ memo_tracklet_frames=10,
186
+ memo_backdrop_frames=1,
187
+ memo_momentum=0.8,
188
+ nms_conf_thr=0.5,
189
+ nms_backdrop_iou_thr=0.3,
190
+ nms_class_iou_thr=0.7,
191
+ with_cats=False,
192
+ match_metric='bisoftmax')
193
+ )
194
+
195
+ test_pipeline = [
196
+ dict(
197
+ type='TransformBroadcaster',
198
+ transforms=[
199
+ dict(type='LoadImageFromFile'),
200
+ dict(
201
+ type='Resize',
202
+ scale=(1024, 1024),
203
+ keep_ratio=True),
204
+ dict(type='LoadTrackAnnotations')
205
+ ]),
206
+ dict(type='PackTrackInputs')
207
+ ]
208
+
209
+ # runtime settings
210
+ train_dataloader = None
211
+ train_cfg = None
212
+ val_cfg = dict(type='ValLoop')
213
+ test_cfg = dict(type='TestLoop')
214
+
215
+ default_hooks = dict(
216
+ logger=dict(type='LoggerHook', interval=50),
217
+ visualization=dict(type='TrackVisualizationHook', draw=False),
218
+ checkpoint=dict(type='CheckpointHook', interval=12),
219
+ )
220
+
221
+ vis_backends = [dict(type='LocalVisBackend')]
222
+ visualizer = dict(
223
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
224
+
225
+ # custom hooks
226
+ custom_hooks = [
227
+ # Synchronize model buffers such as running_mean and running_var in BN
228
+ # at the end of each epoch
229
+ dict(type='SyncBuffersHook')
230
+ ]
231
+ auto_scale_lr = dict(enable=False, base_batch_size=16)
232
+ val_dataloader = dict(
233
+ dataset=dict(
234
+ ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
235
+ pipeline=test_pipeline,
236
+ )
237
+ )
238
+ test_dataloader = val_dataloader
239
+ val_evaluator = dict(
240
+ ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
241
+ scalabel_gt='data/bdd/annotations/scalabel_gt/box_track_20/val/',
242
+ outfile_prefix='results/masa_results/masa-sam-vitb-bdd-mot-test',
243
+ metric=['TETA', 'HOTA', 'CLEAR']
244
+ )
245
+ test_evaluator = val_evaluator
configs/masa-sam/bdd_test/masa_sam_vitb_bdd_mots_test.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../sam-vitb.py',
3
+ '../../datasets/bdd/bdd_dataset.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ detector['init_cfg'] = dict(
9
+ type='Pretrained',
10
+ checkpoint= 'saved_models/pretrain_weights/sam_vit_b_01ec64_mmdet.pth'
11
+ # noqa: E501
12
+ )
13
+ detector['type'] = 'SamMasa'
14
+
15
+ del _base_.model
16
+
17
+ model = dict(
18
+ type='MASA',
19
+ freeze_detector=True,
20
+ unified_backbone=True,
21
+ load_public_dets = True,
22
+ with_segm=True,
23
+ benchmark = 'bdd',
24
+ public_det_path = 'results/public_dets/bdd_mots_val_uninext_dets/',
25
+ data_preprocessor=dict(
26
+ type='TrackDataPreprocessor',
27
+ # Image normalization parameters
28
+ mean=[123.675, 116.28, 103.53],
29
+ std=[58.395, 57.12, 57.375],
30
+ bgr_to_rgb=True,
31
+ # Image padding parameters
32
+ pad_mask=False, # In instance segmentation, the mask needs to be padded
33
+ pad_size_divisor=1024), # Padding the image to multiples of 1024
34
+ detector=detector,
35
+ masa_adapter=[
36
+ dict(
37
+ type='SimpleFPN',
38
+ in_channels=[768, 768, 768, 768],
39
+ out_channels=256,
40
+ use_residual=True,
41
+ num_outs=5),
42
+ dict(
43
+ type='DyHead',
44
+ in_channels=256,
45
+ out_channels=256,
46
+ num_blocks=3)
47
+ ],
48
+ rpn_head=dict(
49
+ type='RPNHead',
50
+ in_channels=256,
51
+ feat_channels=256,
52
+ anchor_generator=dict(
53
+ type='AnchorGenerator',
54
+ scales=[8],
55
+ ratios=[0.5, 1.0, 2.0],
56
+ strides=[4, 8, 16, 32, 64]),
57
+ bbox_coder=dict(
58
+ type='DeltaXYWHBBoxCoder',
59
+ target_means=[.0, .0, .0, .0],
60
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
61
+ loss_cls=dict(
62
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
63
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
64
+ ),
65
+ roi_head=dict(
66
+ type='StandardRoIHead',
67
+ bbox_roi_extractor=dict(
68
+ type='SingleRoIExtractor',
69
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
70
+ out_channels=256,
71
+ featmap_strides=[4, 8, 16, 32]),
72
+ bbox_head=dict(
73
+ type='Shared2FCBBoxHead',
74
+ in_channels=256,
75
+ fc_out_channels=1024,
76
+ roi_feat_size=7,
77
+ num_classes=1,
78
+ bbox_coder=dict(
79
+ type='DeltaXYWHBBoxCoder',
80
+ target_means=[0., 0., 0., 0.],
81
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
82
+ reg_class_agnostic=True,
83
+ loss_cls=dict(
84
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
85
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
86
+ # model training and testing settings
87
+ train_cfg=dict(
88
+ rpn=dict(
89
+ assigner=dict(
90
+ type='MaxIoUAssigner',
91
+ pos_iou_thr=0.7,
92
+ neg_iou_thr=0.3,
93
+ min_pos_iou=0.3,
94
+ match_low_quality=True,
95
+ ignore_iof_thr=-1),
96
+ sampler=dict(
97
+ type='RandomSampler',
98
+ num=256,
99
+ pos_fraction=0.5,
100
+ neg_pos_ub=-1,
101
+ add_gt_as_proposals=False),
102
+ allowed_border=-1,
103
+ pos_weight=-1,
104
+ debug=False),
105
+ rpn_proposal=dict(
106
+ nms_pre=2000,
107
+ max_per_img=1000,
108
+ nms=dict(type='nms', iou_threshold=0.7),
109
+ min_bbox_size=0),
110
+ rcnn=dict(
111
+ assigner=dict(
112
+ type='MaxIoUAssigner',
113
+ pos_iou_thr=0.5,
114
+ neg_iou_thr=0.5,
115
+ min_pos_iou=0.5,
116
+ match_low_quality=False,
117
+ ignore_iof_thr=-1),
118
+ sampler=dict(
119
+ type='RandomSampler',
120
+ num=512,
121
+ pos_fraction=0.25,
122
+ neg_pos_ub=-1,
123
+ add_gt_as_proposals=True),
124
+ pos_weight=-1,
125
+ debug=False)),
126
+ test_cfg=dict(
127
+ rpn=dict(
128
+ nms_pre=1000,
129
+ max_per_img=1000,
130
+ nms=dict(type='nms', iou_threshold=0.7),
131
+ min_bbox_size=0),
132
+ rcnn=dict(
133
+ score_thr=0.02,
134
+ # nms=dict(type='nms', iou_threshold=0.5),
135
+ nms=dict(type='nms',
136
+ iou_threshold=0.5,
137
+ class_agnostic=True,
138
+ split_thr=100000),
139
+ max_per_img=50,
140
+ mask_thr_binary=0.5)
141
+ # soft-nms is also supported for rcnn testing
142
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
143
+ ),
144
+ track_head=dict(
145
+ type='MasaTrackHead',
146
+ roi_extractor=dict(
147
+ type='SingleRoIExtractor',
148
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
149
+ out_channels=256,
150
+ featmap_strides=[4, 8, 16, 32]),
151
+ embed_head=dict(
152
+ type='QuasiDenseEmbedHead',
153
+ num_convs=4,
154
+ num_fcs=1,
155
+ embed_channels=256,
156
+ norm_cfg=dict(type='GN', num_groups=32),
157
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
158
+ loss_track_aux=dict(
159
+ type='MarginL2Loss',
160
+ neg_pos_ub=3,
161
+ pos_margin=0,
162
+ neg_margin=0.1,
163
+ hard_mining=True,
164
+ loss_weight=1.0)),
165
+ train_cfg=dict(
166
+ assigner=dict(
167
+ type='MaxIoUAssigner',
168
+ pos_iou_thr=0.7,
169
+ neg_iou_thr=0.3,
170
+ min_pos_iou=0.5,
171
+ match_low_quality=False,
172
+ ignore_iof_thr=-1),
173
+ sampler=dict(
174
+ type='CombinedSampler',
175
+ num=512,
176
+ pos_fraction=0.5,
177
+ neg_pos_ub=3,
178
+ add_gt_as_proposals=True,
179
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
180
+ neg_sampler=dict(type='RandomSampler')))),
181
+ tracker=dict(
182
+ type='MasaBDDTracker',
183
+ init_score_thr=0.5,
184
+ obj_score_thr=0.3,
185
+ match_score_thr=0.6,
186
+ memo_tracklet_frames=10,
187
+ memo_backdrop_frames=1,
188
+ memo_momentum=0.8,
189
+ nms_conf_thr=0.5,
190
+ nms_backdrop_iou_thr=0.3,
191
+ nms_class_iou_thr=0.7,
192
+ with_cats=False,
193
+ match_metric='bisoftmax')
194
+ )
195
+
196
+ test_pipeline = [
197
+ dict(
198
+ type='TransformBroadcaster',
199
+ transforms=[
200
+ dict(type='LoadImageFromFile'),
201
+ dict(
202
+ type='Resize',
203
+ scale=(1024, 1024),
204
+ keep_ratio=True),
205
+ dict(type='LoadTrackAnnotations')
206
+ ]),
207
+ dict(type='PackTrackInputs')
208
+ ]
209
+
210
+ # runtime settings
211
+ train_dataloader = None
212
+ train_cfg = None
213
+ val_cfg = dict(type='ValLoop')
214
+ test_cfg = dict(type='TestLoop')
215
+
216
+ default_hooks = dict(
217
+ logger=dict(type='LoggerHook', interval=50),
218
+ visualization=dict(type='TrackVisualizationHook', draw=False),
219
+ checkpoint = dict(type='CheckpointHook', interval=1),
220
+ )
221
+
222
+ vis_backends = [dict(type='LocalVisBackend')]
223
+ visualizer = dict(
224
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
225
+
226
+ val_dataloader = dict(
227
+ dataset=dict(
228
+ ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
229
+ pipeline=test_pipeline,
230
+ )
231
+ )
232
+
233
+ test_dataloader = val_dataloader
234
+ val_evaluator = dict(
235
+ ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
236
+ scalabel_gt='data/bdd/annotations/scalabel_gt/seg_track_20/val/',
237
+ outfile_prefix='results/masa_results/masa-sam-vitb-bdd-mots-test',
238
+ metric=['TETA', 'HOTA', 'CLEAR'],
239
+ with_mask=True,
240
+ )
241
+ test_evaluator = val_evaluator
configs/masa-sam/bdd_test/masa_sam_vith_bdd_mot_test.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../sam-vith.py',
3
+ '../../datasets/bdd/bdd_dataset.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ detector['init_cfg'] = dict(
9
+ type='Pretrained',
10
+ checkpoint= 'saved_models/pretrain_weights/sam_vit_h_4b8939_mmdet.pth'
11
+ # noqa: E501
12
+ )
13
+ detector['type'] = 'SamMasa'
14
+
15
+ del _base_.model
16
+
17
+ model = dict(
18
+ type='MASA',
19
+ freeze_detector=True,
20
+ unified_backbone=True,
21
+ load_public_dets = True,
22
+ benchmark = 'bdd',
23
+ public_det_path = 'results/public_dets/bdd_mot_yolox_dets/',
24
+ data_preprocessor=dict(
25
+ type='TrackDataPreprocessor',
26
+ # Image normalization parameters
27
+ mean=[123.675, 116.28, 103.53],
28
+ std=[58.395, 57.12, 57.375],
29
+ bgr_to_rgb=True,
30
+ # Image padding parameters
31
+ pad_mask=False, # In instance segmentation, the mask needs to be padded
32
+ pad_size_divisor=1024), # Padding the image to multiples of 1024
33
+ detector=detector,
34
+ masa_adapter=[
35
+ dict(
36
+ type='SimpleFPN',
37
+ in_channels=[1280, 1280, 1280, 1280],
38
+ out_channels=256,
39
+ use_residual=True,
40
+ num_outs=5),
41
+ dict(
42
+ type='DyHead',
43
+ in_channels=256,
44
+ out_channels=256,
45
+ num_blocks=3)
46
+ ],
47
+ rpn_head=dict(
48
+ type='RPNHead',
49
+ in_channels=256,
50
+ feat_channels=256,
51
+ anchor_generator=dict(
52
+ type='AnchorGenerator',
53
+ scales=[8],
54
+ ratios=[0.5, 1.0, 2.0],
55
+ strides=[4, 8, 16, 32, 64]),
56
+ bbox_coder=dict(
57
+ type='DeltaXYWHBBoxCoder',
58
+ target_means=[.0, .0, .0, .0],
59
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
60
+ loss_cls=dict(
61
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
62
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
63
+ ),
64
+ roi_head=dict(
65
+ type='StandardRoIHead',
66
+ bbox_roi_extractor=dict(
67
+ type='SingleRoIExtractor',
68
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
69
+ out_channels=256,
70
+ featmap_strides=[4, 8, 16, 32]),
71
+ bbox_head=dict(
72
+ type='Shared2FCBBoxHead',
73
+ in_channels=256,
74
+ fc_out_channels=1024,
75
+ roi_feat_size=7,
76
+ num_classes=1,
77
+ bbox_coder=dict(
78
+ type='DeltaXYWHBBoxCoder',
79
+ target_means=[0., 0., 0., 0.],
80
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
81
+ reg_class_agnostic=True,
82
+ loss_cls=dict(
83
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
84
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
85
+ # model training and testing settings
86
+ train_cfg=dict(
87
+ rpn=dict(
88
+ assigner=dict(
89
+ type='MaxIoUAssigner',
90
+ pos_iou_thr=0.7,
91
+ neg_iou_thr=0.3,
92
+ min_pos_iou=0.3,
93
+ match_low_quality=True,
94
+ ignore_iof_thr=-1),
95
+ sampler=dict(
96
+ type='RandomSampler',
97
+ num=256,
98
+ pos_fraction=0.5,
99
+ neg_pos_ub=-1,
100
+ add_gt_as_proposals=False),
101
+ allowed_border=-1,
102
+ pos_weight=-1,
103
+ debug=False),
104
+ rpn_proposal=dict(
105
+ nms_pre=2000,
106
+ max_per_img=1000,
107
+ nms=dict(type='nms', iou_threshold=0.7),
108
+ min_bbox_size=0),
109
+ rcnn=dict(
110
+ assigner=dict(
111
+ type='MaxIoUAssigner',
112
+ pos_iou_thr=0.5,
113
+ neg_iou_thr=0.5,
114
+ min_pos_iou=0.5,
115
+ match_low_quality=False,
116
+ ignore_iof_thr=-1),
117
+ sampler=dict(
118
+ type='RandomSampler',
119
+ num=512,
120
+ pos_fraction=0.25,
121
+ neg_pos_ub=-1,
122
+ add_gt_as_proposals=True),
123
+ pos_weight=-1,
124
+ debug=False)),
125
+ test_cfg=dict(
126
+ rpn=dict(
127
+ nms_pre=1000,
128
+ max_per_img=1000,
129
+ nms=dict(type='nms', iou_threshold=0.7),
130
+ min_bbox_size=0),
131
+ rcnn=dict(
132
+ score_thr=0.02,
133
+ # nms=dict(type='nms', iou_threshold=0.5),
134
+ nms=dict(type='nms',
135
+ iou_threshold=0.5,
136
+ class_agnostic=True,
137
+ split_thr=100000),
138
+ max_per_img=50,
139
+ mask_thr_binary=0.5)
140
+ # soft-nms is also supported for rcnn testing
141
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
142
+ ),
143
+ track_head=dict(
144
+ type='MasaTrackHead',
145
+ roi_extractor=dict(
146
+ type='SingleRoIExtractor',
147
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
148
+ out_channels=256,
149
+ featmap_strides=[4, 8, 16, 32]),
150
+ embed_head=dict(
151
+ type='QuasiDenseEmbedHead',
152
+ num_convs=4,
153
+ num_fcs=1,
154
+ embed_channels=256,
155
+ norm_cfg=dict(type='GN', num_groups=32),
156
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
157
+ loss_track_aux=dict(
158
+ type='MarginL2Loss',
159
+ neg_pos_ub=3,
160
+ pos_margin=0,
161
+ neg_margin=0.1,
162
+ hard_mining=True,
163
+ loss_weight=1.0)),
164
+ # loss_bbox=dict(type='L1Loss', loss_weight=1.0),
165
+ train_cfg=dict(
166
+ assigner=dict(
167
+ type='MaxIoUAssigner',
168
+ pos_iou_thr=0.7,
169
+ neg_iou_thr=0.3,
170
+ min_pos_iou=0.5,
171
+ match_low_quality=False,
172
+ ignore_iof_thr=-1),
173
+ sampler=dict(
174
+ type='CombinedSampler',
175
+ num=512,
176
+ pos_fraction=0.5,
177
+ neg_pos_ub=3,
178
+ add_gt_as_proposals=True,
179
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
180
+ neg_sampler=dict(type='RandomSampler')))),
181
+ tracker=dict(
182
+ type='MasaBDDTracker',
183
+ init_score_thr=0.5,
184
+ obj_score_thr=0.3,
185
+ match_score_thr=0.6,
186
+ memo_tracklet_frames=10,
187
+ memo_backdrop_frames=1,
188
+ memo_momentum=0.8,
189
+ nms_conf_thr=0.5,
190
+ nms_backdrop_iou_thr=0.3,
191
+ nms_class_iou_thr=0.7,
192
+ with_cats=False,
193
+ match_metric='bisoftmax')
194
+ )
195
+
196
+ test_pipeline = [
197
+ dict(
198
+ type='TransformBroadcaster',
199
+ transforms=[
200
+ dict(type='LoadImageFromFile'),
201
+ dict(
202
+ type='Resize',
203
+ scale=(1024, 1024),
204
+ keep_ratio=True),
205
+ dict(type='LoadTrackAnnotations')
206
+ ]),
207
+ dict(type='PackTrackInputs')
208
+ ]
209
+
210
+
211
+ train_dataloader = None
212
+ train_cfg = None
213
+ val_cfg = dict(type='ValLoop')
214
+ test_cfg = dict(type='TestLoop')
215
+
216
+ default_hooks = dict(
217
+ logger=dict(type='LoggerHook', interval=50),
218
+ visualization=dict(type='TrackVisualizationHook', draw=False),
219
+ checkpoint=dict(type='CheckpointHook', interval=12),
220
+ )
221
+
222
+ vis_backends = [dict(type='LocalVisBackend')]
223
+ visualizer = dict(
224
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
225
+
226
+ # custom hooks
227
+ custom_hooks = [
228
+ # Synchronize model buffers such as running_mean and running_var in BN
229
+ # at the end of each epoch
230
+ dict(type='SyncBuffersHook')
231
+ ]
232
+ auto_scale_lr = dict(enable=False, base_batch_size=16)
233
+ val_dataloader = dict(
234
+ dataset=dict(
235
+ ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
236
+ pipeline=test_pipeline,
237
+ )
238
+ )
239
+ test_dataloader = val_dataloader
240
+ val_evaluator = dict(
241
+ ann_file='data/bdd/annotations/box_track_20/box_track_val_cocofmt.json',
242
+ scalabel_gt='data/bdd/annotations/scalabel_gt/box_track_20/val/',
243
+ outfile_prefix='results/masa_results/masa-sam-vith-bdd-mot-test',
244
+ metric=['TETA', 'HOTA', 'CLEAR']
245
+ )
246
+ test_evaluator = val_evaluator
configs/masa-sam/bdd_test/masa_sam_vith_bdd_mots_test.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../sam-vith.py',
3
+ '../../datasets/bdd/bdd_dataset.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ detector['init_cfg'] = dict(
9
+ type='Pretrained',
10
+ checkpoint= 'saved_models/pretrain_weights/sam_vit_h_4b8939_mmdet.pth'
11
+ # noqa: E501
12
+ )
13
+ detector['type'] = 'SamMasa'
14
+
15
+ del _base_.model
16
+
17
+ model = dict(
18
+ type='MASA',
19
+ freeze_detector=True,
20
+ unified_backbone=True,
21
+ load_public_dets = True,
22
+ with_segm=True,
23
+ benchmark = 'bdd',
24
+ public_det_path = 'results/public_dets/bdd_mots_val_uninext_dets/',
25
+ data_preprocessor=dict(
26
+ type='TrackDataPreprocessor',
27
+ # Image normalization parameters
28
+ mean=[123.675, 116.28, 103.53],
29
+ std=[58.395, 57.12, 57.375],
30
+ bgr_to_rgb=True,
31
+ # Image padding parameters
32
+ pad_mask=False, # Mask padding is disabled here (set to True if instance masks must be padded with the image)
33
+ pad_size_divisor=1024), # Padding the image to multiples of 1024
34
+ detector=detector,
35
+ masa_adapter=[
36
+ dict(
37
+ type='SimpleFPN',
38
+ in_channels=[1280, 1280, 1280, 1280],
39
+ out_channels=256,
40
+ use_residual=True,
41
+ num_outs=5),
42
+ dict(
43
+ type='DyHead',
44
+ in_channels=256,
45
+ out_channels=256,
46
+ num_blocks=3)
47
+ ],
48
+ rpn_head=dict(
49
+ type='RPNHead',
50
+ in_channels=256,
51
+ feat_channels=256,
52
+ anchor_generator=dict(
53
+ type='AnchorGenerator',
54
+ scales=[8],
55
+ ratios=[0.5, 1.0, 2.0],
56
+ strides=[4, 8, 16, 32, 64]),
57
+ bbox_coder=dict(
58
+ type='DeltaXYWHBBoxCoder',
59
+ target_means=[.0, .0, .0, .0],
60
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
61
+ loss_cls=dict(
62
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
63
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
64
+ ),
65
+ roi_head=dict(
66
+ type='StandardRoIHead',
67
+ bbox_roi_extractor=dict(
68
+ type='SingleRoIExtractor',
69
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
70
+ out_channels=256,
71
+ featmap_strides=[4, 8, 16, 32]),
72
+ bbox_head=dict(
73
+ type='Shared2FCBBoxHead',
74
+ in_channels=256,
75
+ fc_out_channels=1024,
76
+ roi_feat_size=7,
77
+ num_classes=1,
78
+ bbox_coder=dict(
79
+ type='DeltaXYWHBBoxCoder',
80
+ target_means=[0., 0., 0., 0.],
81
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
82
+ reg_class_agnostic=True,
83
+ loss_cls=dict(
84
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
85
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
86
+ # model training and testing settings
87
+ train_cfg=dict(
88
+ rpn=dict(
89
+ assigner=dict(
90
+ type='MaxIoUAssigner',
91
+ pos_iou_thr=0.7,
92
+ neg_iou_thr=0.3,
93
+ min_pos_iou=0.3,
94
+ match_low_quality=True,
95
+ ignore_iof_thr=-1),
96
+ sampler=dict(
97
+ type='RandomSampler',
98
+ num=256,
99
+ pos_fraction=0.5,
100
+ neg_pos_ub=-1,
101
+ add_gt_as_proposals=False),
102
+ allowed_border=-1,
103
+ pos_weight=-1,
104
+ debug=False),
105
+ rpn_proposal=dict(
106
+ nms_pre=2000,
107
+ max_per_img=1000,
108
+ nms=dict(type='nms', iou_threshold=0.7),
109
+ min_bbox_size=0),
110
+ rcnn=dict(
111
+ assigner=dict(
112
+ type='MaxIoUAssigner',
113
+ pos_iou_thr=0.5,
114
+ neg_iou_thr=0.5,
115
+ min_pos_iou=0.5,
116
+ match_low_quality=False,
117
+ ignore_iof_thr=-1),
118
+ sampler=dict(
119
+ type='RandomSampler',
120
+ num=512,
121
+ pos_fraction=0.25,
122
+ neg_pos_ub=-1,
123
+ add_gt_as_proposals=True),
124
+ pos_weight=-1,
125
+ debug=False)),
126
+ test_cfg=dict(
127
+ rpn=dict(
128
+ nms_pre=1000,
129
+ max_per_img=1000,
130
+ nms=dict(type='nms', iou_threshold=0.7),
131
+ min_bbox_size=0),
132
+ rcnn=dict(
133
+ score_thr=0.02,
134
+ # nms=dict(type='nms', iou_threshold=0.5),
135
+ nms=dict(type='nms',
136
+ iou_threshold=0.5,
137
+ class_agnostic=True,
138
+ split_thr=100000),
139
+ max_per_img=50,
140
+ mask_thr_binary=0.5)
141
+ # soft-nms is also supported for rcnn testing
142
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
143
+ ),
144
+ track_head=dict(
145
+ type='MasaTrackHead',
146
+ roi_extractor=dict(
147
+ type='SingleRoIExtractor',
148
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
149
+ out_channels=256,
150
+ featmap_strides=[4, 8, 16, 32]),
151
+ embed_head=dict(
152
+ type='QuasiDenseEmbedHead',
153
+ num_convs=4,
154
+ num_fcs=1,
155
+ embed_channels=256,
156
+ norm_cfg=dict(type='GN', num_groups=32),
157
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
158
+ loss_track_aux=dict(
159
+ type='MarginL2Loss',
160
+ neg_pos_ub=3,
161
+ pos_margin=0,
162
+ neg_margin=0.1,
163
+ hard_mining=True,
164
+ loss_weight=1.0)),
165
+ # loss_bbox=dict(type='L1Loss', loss_weight=1.0),
166
+ train_cfg=dict(
167
+ assigner=dict(
168
+ type='MaxIoUAssigner',
169
+ pos_iou_thr=0.7,
170
+ neg_iou_thr=0.3,
171
+ min_pos_iou=0.5,
172
+ match_low_quality=False,
173
+ ignore_iof_thr=-1),
174
+ sampler=dict(
175
+ type='CombinedSampler',
176
+ num=512,
177
+ pos_fraction=0.5,
178
+ neg_pos_ub=3,
179
+ add_gt_as_proposals=True,
180
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
181
+ neg_sampler=dict(type='RandomSampler')))),
182
+ tracker=dict(
183
+ type='MasaBDDTracker',
184
+ init_score_thr=0.5,
185
+ obj_score_thr=0.3,
186
+ match_score_thr=0.6,
187
+ memo_tracklet_frames=10,
188
+ memo_backdrop_frames=1,
189
+ memo_momentum=0.8,
190
+ nms_conf_thr=0.5,
191
+ nms_backdrop_iou_thr=0.3,
192
+ nms_class_iou_thr=0.7,
193
+ with_cats=False,
194
+ match_metric='bisoftmax')
195
+ )
196
+
197
+ test_pipeline = [
198
+ dict(
199
+ type='TransformBroadcaster',
200
+ transforms=[
201
+ dict(type='LoadImageFromFile'),
202
+ dict(
203
+ type='Resize',
204
+ scale=(1024, 1024),
205
+ keep_ratio=True),
206
+ dict(type='LoadTrackAnnotations')
207
+ ]),
208
+ dict(type='PackTrackInputs')
209
+ ]
210
+
211
+
212
+ train_dataloader = None
213
+ train_cfg = None
214
+ val_cfg = dict(type='ValLoop')
215
+ test_cfg = dict(type='TestLoop')
216
+
217
+ default_hooks = dict(
218
+ logger=dict(type='LoggerHook', interval=50),
219
+ visualization=dict(type='TrackVisualizationHook', draw=False),
220
+ checkpoint = dict(type='CheckpointHook', interval=1),
221
+ )
222
+
223
+ vis_backends = [dict(type='LocalVisBackend')]
224
+ visualizer = dict(
225
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
226
+
227
+ val_dataloader = dict(
228
+ dataset=dict(
229
+ ann_file='data/bdd/annotations/seg_track_val_cocofmt.json',
230
+ pipeline=test_pipeline,
231
+ )
232
+ )
233
+
234
+ test_dataloader = val_dataloader
235
+ val_evaluator = dict(
236
+ outfile_prefix='results/masa_results/masa-sam-vith-bdd-mots-test',
237
+ metric=['TETA'],
238
+ with_mask=True,
239
+ )
240
+ test_evaluator = val_evaluator
configs/masa-sam/open_vocabulary_mot_test/masa_sam_vitb_open_vocabulary_test.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../sam-vitb.py',
3
+ '../../datasets/tao/tao_dataset_v1.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ detector['init_cfg'] = dict(
9
+ type='Pretrained',
10
+ checkpoint= 'saved_models/pretrain_weights/sam_vit_b_01ec64_mmdet.pth'
11
+ # noqa: E501
12
+ )
13
+ detector['type'] = 'SamMasa'
14
+
15
+ del _base_.model
16
+
17
+ model = dict(
18
+ type='MASA',
19
+ freeze_detector=True,
20
+ unified_backbone=True,
21
+ load_public_dets = True,
22
+ benchmark = 'tao',
23
+ public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
24
+ data_preprocessor=dict(
25
+ type='TrackDataPreprocessor',
26
+ # Image normalization parameters
27
+ mean=[123.675, 116.28, 103.53],
28
+ std=[58.395, 57.12, 57.375],
29
+ bgr_to_rgb=True,
30
+ # Image padding parameters
31
+ pad_mask=False, # Mask padding is disabled here (set to True if instance masks must be padded with the image)
32
+ pad_size_divisor=1024), # Padding the image to multiples of 1024
33
+ detector=detector,
34
+ masa_adapter=[
35
+ dict(
36
+ type='SimpleFPN',
37
+ in_channels=[768, 768, 768, 768],
38
+ out_channels=256,
39
+ use_residual=True,
40
+ num_outs=5),
41
+ dict(
42
+ type='DyHead',
43
+ in_channels=256,
44
+ out_channels=256,
45
+ num_blocks=3)
46
+ ],
47
+ rpn_head=dict(
48
+ type='RPNHead',
49
+ in_channels=256,
50
+ feat_channels=256,
51
+ anchor_generator=dict(
52
+ type='AnchorGenerator',
53
+ scales=[8],
54
+ ratios=[0.5, 1.0, 2.0],
55
+ strides=[4, 8, 16, 32, 64]),
56
+ bbox_coder=dict(
57
+ type='DeltaXYWHBBoxCoder',
58
+ target_means=[.0, .0, .0, .0],
59
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
60
+ loss_cls=dict(
61
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
62
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
63
+ ),
64
+ roi_head=dict(
65
+ type='StandardRoIHead',
66
+ bbox_roi_extractor=dict(
67
+ type='SingleRoIExtractor',
68
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
69
+ out_channels=256,
70
+ featmap_strides=[4, 8, 16, 32]),
71
+ bbox_head=dict(
72
+ type='Shared2FCBBoxHead',
73
+ in_channels=256,
74
+ fc_out_channels=1024,
75
+ roi_feat_size=7,
76
+ num_classes=1,
77
+ bbox_coder=dict(
78
+ type='DeltaXYWHBBoxCoder',
79
+ target_means=[0., 0., 0., 0.],
80
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
81
+ reg_class_agnostic=True,
82
+ loss_cls=dict(
83
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
84
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
85
+ # model training and testing settings
86
+ train_cfg=dict(
87
+ rpn=dict(
88
+ assigner=dict(
89
+ type='MaxIoUAssigner',
90
+ pos_iou_thr=0.7,
91
+ neg_iou_thr=0.3,
92
+ min_pos_iou=0.3,
93
+ match_low_quality=True,
94
+ ignore_iof_thr=-1),
95
+ sampler=dict(
96
+ type='RandomSampler',
97
+ num=256,
98
+ pos_fraction=0.5,
99
+ neg_pos_ub=-1,
100
+ add_gt_as_proposals=False),
101
+ allowed_border=-1,
102
+ pos_weight=-1,
103
+ debug=False),
104
+ rpn_proposal=dict(
105
+ nms_pre=2000,
106
+ max_per_img=1000,
107
+ nms=dict(type='nms', iou_threshold=0.7),
108
+ min_bbox_size=0),
109
+ rcnn=dict(
110
+ assigner=dict(
111
+ type='MaxIoUAssigner',
112
+ pos_iou_thr=0.5,
113
+ neg_iou_thr=0.5,
114
+ min_pos_iou=0.5,
115
+ match_low_quality=False,
116
+ ignore_iof_thr=-1),
117
+ sampler=dict(
118
+ type='RandomSampler',
119
+ num=512,
120
+ pos_fraction=0.25,
121
+ neg_pos_ub=-1,
122
+ add_gt_as_proposals=True),
123
+ pos_weight=-1,
124
+ debug=False)),
125
+ test_cfg=dict(
126
+ rpn=dict(
127
+ nms_pre=1000,
128
+ max_per_img=1000,
129
+ nms=dict(type='nms', iou_threshold=0.7),
130
+ min_bbox_size=0),
131
+ rcnn=dict(
132
+ score_thr=0.02,
133
+ # nms=dict(type='nms', iou_threshold=0.5),
134
+ nms=dict(type='nms',
135
+ iou_threshold=0.5,
136
+ class_agnostic=True,
137
+ split_thr=100000),
138
+ max_per_img=50,
139
+ mask_thr_binary=0.5)
140
+ # soft-nms is also supported for rcnn testing
141
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
142
+ ),
143
+ track_head=dict(
144
+ type='MasaTrackHead',
145
+ roi_extractor=dict(
146
+ type='SingleRoIExtractor',
147
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
148
+ out_channels=256,
149
+ featmap_strides=[4, 8, 16, 32]),
150
+ embed_head=dict(
151
+ type='QuasiDenseEmbedHead',
152
+ num_convs=4,
153
+ num_fcs=1,
154
+ embed_channels=256,
155
+ norm_cfg=dict(type='GN', num_groups=32),
156
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
157
+ loss_track_aux=dict(
158
+ type='MarginL2Loss',
159
+ neg_pos_ub=3,
160
+ pos_margin=0,
161
+ neg_margin=0.1,
162
+ hard_mining=True,
163
+ loss_weight=1.0)),
164
+ train_cfg=dict(
165
+ assigner=dict(
166
+ type='MaxIoUAssigner',
167
+ pos_iou_thr=0.7,
168
+ neg_iou_thr=0.3,
169
+ min_pos_iou=0.5,
170
+ match_low_quality=False,
171
+ ignore_iof_thr=-1),
172
+ sampler=dict(
173
+ type='CombinedSampler',
174
+ num=512,
175
+ pos_fraction=0.5,
176
+ neg_pos_ub=3,
177
+ add_gt_as_proposals=True,
178
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
179
+ neg_sampler=dict(type='RandomSampler')))),
180
+ tracker=dict(
181
+ type='MasaTaoTracker',
182
+ init_score_thr=0.0001,
183
+ obj_score_thr=0.0001,
184
+ match_score_thr=0.5,
185
+ memo_tracklet_frames=10,
186
+ memo_momentum=0.8,
187
+ with_cats=False,
188
+ max_distance=-1,
189
+ fps=1,
190
+ )
191
+ )
192
+
193
+ test_pipeline = [
194
+ dict(
195
+ type='TransformBroadcaster',
196
+ transforms=[
197
+ dict(type='LoadImageFromFile'),
198
+ dict(
199
+ type='Resize',
200
+ scale=(1024, 1024),
201
+ keep_ratio=True),
202
+ dict(type='LoadTrackAnnotations')
203
+ ]),
204
+ dict(type='PackTrackInputs')
205
+ ]
206
+
207
+ # runtime settings
208
+ train_dataloader = None
209
+ train_cfg = None
210
+ val_cfg = dict(type='ValLoop')
211
+ test_cfg = dict(type='TestLoop')
212
+
213
+ default_hooks = dict(
214
+ logger=dict(type='LoggerHook', interval=50),
215
+ visualization=dict(type='TrackVisualizationHook', draw=False))
216
+
217
+ vis_backends = [dict(type='LocalVisBackend')]
218
+ visualizer = dict(
219
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
220
+
221
+ auto_scale_lr = dict(enable=False, base_batch_size=16)
222
+ val_dataloader = dict(
223
+ dataset=dict(
224
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
225
+ pipeline=test_pipeline,
226
+ )
227
+ )
228
+ test_dataloader = val_dataloader
229
+ test_evaluator = dict(
230
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
231
+ outfile_prefix='results/masa_results/masa-sam-b-release-ovmot-test',
232
+ open_vocabulary=True,
233
+ )
configs/masa-sam/open_vocabulary_mot_test/masa_sam_vith_open_vocabulary_test.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../sam-vith.py',
3
+ '../../datasets/tao/tao_dataset_v1.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ detector['init_cfg'] = dict(
9
+ type='Pretrained',
10
+ checkpoint= 'saved_models/pretrain_weights/sam_vit_h_4b8939_mmdet.pth'
11
+ # noqa: E501
12
+ )
13
+ detector['type'] = 'SamMasa'
14
+
15
+ del _base_.model
16
+
17
+ model = dict(
18
+ type='MASA',
19
+ freeze_detector=True,
20
+ unified_backbone=True,
21
+ load_public_dets = True,
22
+ benchmark = 'tao',
23
+ public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
24
+ data_preprocessor=dict(
25
+ type='TrackDataPreprocessor',
26
+ # Image normalization parameters
27
+ mean=[123.675, 116.28, 103.53],
28
+ std=[58.395, 57.12, 57.375],
29
+ bgr_to_rgb=True,
30
+ # Image padding parameters
31
+ pad_mask=False, # Mask padding is disabled here (set to True if instance masks must be padded with the image)
32
+ pad_size_divisor=1024), # Padding the image to multiples of 1024
33
+ detector=detector,
34
+ masa_adapter=[
35
+ dict(
36
+ type='SimpleFPN',
37
+ in_channels=[1280, 1280, 1280, 1280],
38
+ out_channels=256,
39
+ use_residual=True,
40
+ num_outs=5),
41
+ dict(
42
+ type='DyHead',
43
+ in_channels=256,
44
+ out_channels=256,
45
+ num_blocks=3)
46
+ ],
47
+ rpn_head=dict(
48
+ type='RPNHead',
49
+ in_channels=256,
50
+ feat_channels=256,
51
+ anchor_generator=dict(
52
+ type='AnchorGenerator',
53
+ scales=[8],
54
+ ratios=[0.5, 1.0, 2.0],
55
+ strides=[4, 8, 16, 32, 64]),
56
+ bbox_coder=dict(
57
+ type='DeltaXYWHBBoxCoder',
58
+ target_means=[.0, .0, .0, .0],
59
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
60
+ loss_cls=dict(
61
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
62
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
63
+ ),
64
+ roi_head=dict(
65
+ type='StandardRoIHead',
66
+ bbox_roi_extractor=dict(
67
+ type='SingleRoIExtractor',
68
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
69
+ out_channels=256,
70
+ featmap_strides=[4, 8, 16, 32]),
71
+ bbox_head=dict(
72
+ type='Shared2FCBBoxHead',
73
+ in_channels=256,
74
+ fc_out_channels=1024,
75
+ roi_feat_size=7,
76
+ num_classes=1,
77
+ bbox_coder=dict(
78
+ type='DeltaXYWHBBoxCoder',
79
+ target_means=[0., 0., 0., 0.],
80
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
81
+ reg_class_agnostic=True,
82
+ loss_cls=dict(
83
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
84
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
85
+ # model training and testing settings
86
+ train_cfg=dict(
87
+ rpn=dict(
88
+ assigner=dict(
89
+ type='MaxIoUAssigner',
90
+ pos_iou_thr=0.7,
91
+ neg_iou_thr=0.3,
92
+ min_pos_iou=0.3,
93
+ match_low_quality=True,
94
+ ignore_iof_thr=-1),
95
+ sampler=dict(
96
+ type='RandomSampler',
97
+ num=256,
98
+ pos_fraction=0.5,
99
+ neg_pos_ub=-1,
100
+ add_gt_as_proposals=False),
101
+ allowed_border=-1,
102
+ pos_weight=-1,
103
+ debug=False),
104
+ rpn_proposal=dict(
105
+ nms_pre=2000,
106
+ max_per_img=1000,
107
+ nms=dict(type='nms', iou_threshold=0.7),
108
+ min_bbox_size=0),
109
+ rcnn=dict(
110
+ assigner=dict(
111
+ type='MaxIoUAssigner',
112
+ pos_iou_thr=0.5,
113
+ neg_iou_thr=0.5,
114
+ min_pos_iou=0.5,
115
+ match_low_quality=False,
116
+ ignore_iof_thr=-1),
117
+ sampler=dict(
118
+ type='RandomSampler',
119
+ num=512,
120
+ pos_fraction=0.25,
121
+ neg_pos_ub=-1,
122
+ add_gt_as_proposals=True),
123
+ pos_weight=-1,
124
+ debug=False)),
125
+ test_cfg=dict(
126
+ rpn=dict(
127
+ nms_pre=1000,
128
+ max_per_img=1000,
129
+ nms=dict(type='nms', iou_threshold=0.7),
130
+ min_bbox_size=0),
131
+ rcnn=dict(
132
+ score_thr=0.02,
133
+ # nms=dict(type='nms', iou_threshold=0.5),
134
+ nms=dict(type='nms',
135
+ iou_threshold=0.5,
136
+ class_agnostic=True,
137
+ split_thr=100000),
138
+ max_per_img=50,
139
+ mask_thr_binary=0.5)
140
+ # soft-nms is also supported for rcnn testing
141
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
142
+ ),
143
+ track_head=dict(
144
+ type='MasaTrackHead',
145
+ roi_extractor=dict(
146
+ type='SingleRoIExtractor',
147
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
148
+ out_channels=256,
149
+ featmap_strides=[4, 8, 16, 32]),
150
+ embed_head=dict(
151
+ type='QuasiDenseEmbedHead',
152
+ num_convs=4,
153
+ num_fcs=1,
154
+ embed_channels=256,
155
+ norm_cfg=dict(type='GN', num_groups=32),
156
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
157
+ loss_track_aux=dict(
158
+ type='MarginL2Loss',
159
+ neg_pos_ub=3,
160
+ pos_margin=0,
161
+ neg_margin=0.1,
162
+ hard_mining=True,
163
+ loss_weight=1.0)),
164
+ # loss_bbox=dict(type='L1Loss', loss_weight=1.0),
165
+ train_cfg=dict(
166
+ assigner=dict(
167
+ type='MaxIoUAssigner',
168
+ pos_iou_thr=0.7,
169
+ neg_iou_thr=0.5,
170
+ min_pos_iou=0.5,
171
+ match_low_quality=False,
172
+ ignore_iof_thr=-1),
173
+ sampler=dict(
174
+ type='CombinedSampler',
175
+ num=512,
176
+ pos_fraction=0.8,
177
+ neg_pos_ub=3,
178
+ add_gt_as_proposals=True,
179
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
180
+ neg_sampler=dict(type='RandomSampler')))),
181
+ tracker=dict(
182
+ type='MasaTaoTracker',
183
+ init_score_thr=0.0001,
184
+ obj_score_thr=0.0001,
185
+ match_score_thr=0.5,
186
+ memo_tracklet_frames=10,
187
+ memo_momentum=0.8,
188
+ with_cats=False,
189
+ max_distance=-1,
190
+ fps=1,
191
+ )
192
+ )
193
+
194
+ test_pipeline = [
195
+ dict(
196
+ type='TransformBroadcaster',
197
+ transforms=[
198
+ dict(type='LoadImageFromFile'),
199
+ dict(
200
+ type='Resize',
201
+ scale=(1024, 1024),
202
+ keep_ratio=True),
203
+ dict(type='LoadTrackAnnotations')
204
+ ]),
205
+ dict(type='PackTrackInputs')
206
+ ]
207
+
208
+
209
+ train_dataloader = None
210
+ train_cfg = None
211
+ val_cfg = dict(type='ValLoop')
212
+ test_cfg = dict(type='TestLoop')
213
+
214
+ default_hooks = dict(
215
+ logger=dict(type='LoggerHook', interval=50),
216
+ visualization=dict(type='TrackVisualizationHook', draw=False))
217
+
218
+ vis_backends = [dict(type='LocalVisBackend')]
219
+ visualizer = dict(
220
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
221
+
222
+ auto_scale_lr = dict(enable=False, base_batch_size=16)
223
+ val_dataloader = dict(
224
+ dataset=dict(
225
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
226
+ pipeline=test_pipeline,
227
+ )
228
+ )
229
+ test_dataloader = val_dataloader
230
+ test_evaluator = dict(
231
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
232
+ outfile_prefix='results/masa_results/masa-sam-h-release-ovmot-test',
233
+ open_vocabulary=True,
234
+ )
configs/masa-sam/sam-vitb.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ prompt_embed_dim=256
2
+ model = dict(
3
+ type='SamMasa',
4
+ backbone=dict(
5
+ type='ImageEncoderViT',
6
+ depth=12,
7
+ embed_dim=768,
8
+ img_size=1024,
9
+ mlp_ratio=4,
10
+ num_heads=12,
11
+ patch_size=16,
12
+ qkv_bias=True,
13
+ use_rel_pos=True,
14
+ global_attn_indexes=[2, 5, 8, 11],
15
+ window_size=14,
16
+ out_chans=prompt_embed_dim,
17
+ out_indices=[2, 5, 8, 11]),
18
+ mask_decoder=dict(
19
+ type='MaskDecoder',
20
+ num_multimask_outputs=3,
21
+ transformer_dim=prompt_embed_dim,
22
+ iou_head_depth=3,
23
+ iou_head_hidden_dim=256),
24
+ prompt_encoder=dict(
25
+ type='PromptEncoder',
26
+ embed_dim=prompt_embed_dim,
27
+ image_embedding_size=(64, 64),
28
+ input_image_size=(1024, 1024),
29
+ mask_in_chans=16),
30
+ )
configs/masa-sam/sam-vith.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ prompt_embed_dim=256
2
+ model = dict(
3
+ type='SamMasa',
4
+ backbone=dict(
5
+ type='ImageEncoderViT',
6
+ depth=32,
7
+ embed_dim=1280,
8
+ img_size=1024,
9
+ mlp_ratio=4,
10
+ num_heads=16,
11
+ patch_size=16,
12
+ qkv_bias=True,
13
+ use_rel_pos=True,
14
+ global_attn_indexes=[7, 15, 23, 31],
15
+ window_size=14,
16
+ out_chans=prompt_embed_dim,
17
+ out_indices=[7, 15, 23, 31]),
18
+ mask_decoder=dict(
19
+ type='MaskDecoder',
20
+ num_multimask_outputs=3,
21
+ transformer_dim=prompt_embed_dim,
22
+ iou_head_depth=3,
23
+ iou_head_hidden_dim=256),
24
+ prompt_encoder=dict(
25
+ type='PromptEncoder',
26
+ embed_dim=prompt_embed_dim,
27
+ image_embedding_size=(64, 64),
28
+ input_image_size=(1024, 1024),
29
+ mask_in_chans=16),
30
+ )
configs/masa-sam/tao_teta_test/masa_sam_vitb_tao_test_detic_dets.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../sam-vitb.py',
3
+ '../../datasets/tao/tao_dataset_v1.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ detector['init_cfg'] = dict(
9
+ type='Pretrained',
10
+ checkpoint= 'saved_models/pretrain_weights/sam_vit_b_01ec64_mmdet.pth'
11
+ # noqa: E501
12
+ )
13
+ detector['type'] = 'SamMasa'
14
+
15
+ del _base_.model
16
+
17
+ model = dict(
18
+ type='MASA',
19
+ freeze_detector=True,
20
+ unified_backbone=True,
21
+ load_public_dets = True,
22
+ benchmark = 'tao',
23
+ public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
24
+ data_preprocessor=dict(
25
+ type='TrackDataPreprocessor',
26
+ # Image normalization parameters
27
+ mean=[123.675, 116.28, 103.53],
28
+ std=[58.395, 57.12, 57.375],
29
+ bgr_to_rgb=True,
30
+ # Image padding parameters
31
+ pad_mask=False, # Mask padding is disabled here (set to True if instance masks must be padded with the image)
32
+ pad_size_divisor=1024), # Padding the image to multiples of 1024
33
+ detector=detector,
34
+ masa_adapter=[
35
+ dict(
36
+ type='SimpleFPN',
37
+ in_channels=[768, 768, 768, 768],
38
+ out_channels=256,
39
+ use_residual=True,
40
+ num_outs=5),
41
+ dict(
42
+ type='DyHead',
43
+ in_channels=256,
44
+ out_channels=256,
45
+ num_blocks=3)
46
+ ],
47
+ rpn_head=dict(
48
+ type='RPNHead',
49
+ in_channels=256,
50
+ feat_channels=256,
51
+ anchor_generator=dict(
52
+ type='AnchorGenerator',
53
+ scales=[8],
54
+ ratios=[0.5, 1.0, 2.0],
55
+ strides=[4, 8, 16, 32, 64]),
56
+ bbox_coder=dict(
57
+ type='DeltaXYWHBBoxCoder',
58
+ target_means=[.0, .0, .0, .0],
59
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
60
+ loss_cls=dict(
61
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
62
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
63
+ ),
64
+ roi_head=dict(
65
+ type='StandardRoIHead',
66
+ bbox_roi_extractor=dict(
67
+ type='SingleRoIExtractor',
68
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
69
+ out_channels=256,
70
+ featmap_strides=[4, 8, 16, 32]),
71
+ bbox_head=dict(
72
+ type='Shared2FCBBoxHead',
73
+ in_channels=256,
74
+ fc_out_channels=1024,
75
+ roi_feat_size=7,
76
+ num_classes=1,
77
+ bbox_coder=dict(
78
+ type='DeltaXYWHBBoxCoder',
79
+ target_means=[0., 0., 0., 0.],
80
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
81
+ reg_class_agnostic=True,
82
+ loss_cls=dict(
83
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
84
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
85
+ # model training and testing settings
86
+ train_cfg=dict(
87
+ rpn=dict(
88
+ assigner=dict(
89
+ type='MaxIoUAssigner',
90
+ pos_iou_thr=0.7,
91
+ neg_iou_thr=0.3,
92
+ min_pos_iou=0.3,
93
+ match_low_quality=True,
94
+ ignore_iof_thr=-1),
95
+ sampler=dict(
96
+ type='RandomSampler',
97
+ num=256,
98
+ pos_fraction=0.5,
99
+ neg_pos_ub=-1,
100
+ add_gt_as_proposals=False),
101
+ allowed_border=-1,
102
+ pos_weight=-1,
103
+ debug=False),
104
+ rpn_proposal=dict(
105
+ nms_pre=2000,
106
+ max_per_img=1000,
107
+ nms=dict(type='nms', iou_threshold=0.7),
108
+ min_bbox_size=0),
109
+ rcnn=dict(
110
+ assigner=dict(
111
+ type='MaxIoUAssigner',
112
+ pos_iou_thr=0.5,
113
+ neg_iou_thr=0.5,
114
+ min_pos_iou=0.5,
115
+ match_low_quality=False,
116
+ ignore_iof_thr=-1),
117
+ sampler=dict(
118
+ type='RandomSampler',
119
+ num=512,
120
+ pos_fraction=0.25,
121
+ neg_pos_ub=-1,
122
+ add_gt_as_proposals=True),
123
+ pos_weight=-1,
124
+ debug=False)),
125
+ test_cfg=dict(
126
+ rpn=dict(
127
+ nms_pre=1000,
128
+ max_per_img=1000,
129
+ nms=dict(type='nms', iou_threshold=0.7),
130
+ min_bbox_size=0),
131
+ rcnn=dict(
132
+ score_thr=0.02,
133
+ # nms=dict(type='nms', iou_threshold=0.5),
134
+ nms=dict(type='nms',
135
+ iou_threshold=0.5,
136
+ class_agnostic=True,
137
+ split_thr=100000),
138
+ max_per_img=50,
139
+ mask_thr_binary=0.5)
140
+ # soft-nms is also supported for rcnn testing
141
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
142
+ ),
143
+ track_head=dict(
144
+ type='MasaTrackHead',
145
+ roi_extractor=dict(
146
+ type='SingleRoIExtractor',
147
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
148
+ out_channels=256,
149
+ featmap_strides=[4, 8, 16, 32]),
150
+ embed_head=dict(
151
+ type='QuasiDenseEmbedHead',
152
+ num_convs=4,
153
+ num_fcs=1,
154
+ embed_channels=256,
155
+ norm_cfg=dict(type='GN', num_groups=32),
156
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
157
+ loss_track_aux=dict(
158
+ type='MarginL2Loss',
159
+ neg_pos_ub=3,
160
+ pos_margin=0,
161
+ neg_margin=0.1,
162
+ hard_mining=True,
163
+ loss_weight=1.0)),
164
+ train_cfg=dict(
165
+ assigner=dict(
166
+ type='MaxIoUAssigner',
167
+ pos_iou_thr=0.7,
168
+ neg_iou_thr=0.3,
169
+ min_pos_iou=0.5,
170
+ match_low_quality=False,
171
+ ignore_iof_thr=-1),
172
+ sampler=dict(
173
+ type='CombinedSampler',
174
+ num=512,
175
+ pos_fraction=0.5,
176
+ neg_pos_ub=3,
177
+ add_gt_as_proposals=True,
178
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
179
+ neg_sampler=dict(type='RandomSampler')))),
180
+ tracker=dict(
181
+ type='MasaTaoTracker',
182
+ init_score_thr=0.0001,
183
+ obj_score_thr=0.0001,
184
+ match_score_thr=0.5,
185
+ memo_tracklet_frames=10,
186
+ memo_momentum=0.8,
187
+ with_cats=False,
188
+ max_distance=-1,
189
+ fps=1,
190
+ )
191
+ )
192
+
193
+ test_pipeline = [
194
+ dict(
195
+ type='TransformBroadcaster',
196
+ transforms=[
197
+ dict(type='LoadImageFromFile'),
198
+ dict(
199
+ type='Resize',
200
+ scale=(1024, 1024),
201
+ keep_ratio=True),
202
+ dict(type='LoadTrackAnnotations')
203
+ ]),
204
+ dict(type='PackTrackInputs')
205
+ ]
206
+
207
+ # runtime settings
208
+ train_dataloader = None
209
+ train_cfg = None
210
+ val_cfg = dict(type='ValLoop')
211
+ test_cfg = dict(type='TestLoop')
212
+
213
+ default_hooks = dict(
214
+ logger=dict(type='LoggerHook', interval=50),
215
+ visualization=dict(type='TrackVisualizationHook', draw=False))
216
+
217
+ vis_backends = [dict(type='LocalVisBackend')]
218
+ visualizer = dict(
219
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
220
+
221
+ auto_scale_lr = dict(enable=False, base_batch_size=16)
222
+ val_dataloader = dict(
223
+ dataset=dict(
224
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
225
+ pipeline=test_pipeline,
226
+ )
227
+ )
228
+ test_dataloader = val_dataloader
229
+ test_evaluator = dict(
230
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
231
+ outfile_prefix='results/masa_results/masa-sam-vitb-tao-test-detic-dets',
232
+ )
configs/masa-sam/tao_teta_test/masa_sam_vitb_tao_test_teter_swinT_dets.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../sam-vitb.py',
3
+ '../../datasets/tao/tao_dataset_v05.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ detector['init_cfg'] = dict(
9
+ type='Pretrained',
10
+ checkpoint= 'saved_models/pretrain_weights/sam_vit_b_01ec64_mmdet.pth'
11
+ # noqa: E501
12
+ )
13
+ detector['type'] = 'SamMasa'
14
+
15
+ del _base_.model
16
+
17
+ model = dict(
18
+ type='MASA',
19
+ freeze_detector=True,
20
+ unified_backbone=True,
21
+ load_public_dets = True,
22
+ public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/teter_swinT_tao_val_internms_50/',
23
+ data_preprocessor=dict(
24
+ type='TrackDataPreprocessor',
25
+ # Image normalization parameters
26
+ mean=[123.675, 116.28, 103.53],
27
+ std=[58.395, 57.12, 57.375],
28
+ bgr_to_rgb=True,
29
+ # Image padding parameters
30
+ pad_mask=False, # Mask padding is disabled here; in instance segmentation, the mask needs to be padded
31
+ pad_size_divisor=1024), # Padding the image to multiples of 1024
32
+ detector=detector,
33
+ masa_adapter=[
34
+ dict(
35
+ type='SimpleFPN',
36
+ in_channels=[768, 768, 768, 768],
37
+ out_channels=256,
38
+ use_residual=True,
39
+ num_outs=5),
40
+ dict(
41
+ type='DyHead',
42
+ in_channels=256,
43
+ out_channels=256,
44
+ num_blocks=3)
45
+ ],
46
+ rpn_head=dict(
47
+ type='RPNHead',
48
+ in_channels=256,
49
+ feat_channels=256,
50
+ anchor_generator=dict(
51
+ type='AnchorGenerator',
52
+ scales=[8],
53
+ ratios=[0.5, 1.0, 2.0],
54
+ strides=[4, 8, 16, 32, 64]),
55
+ bbox_coder=dict(
56
+ type='DeltaXYWHBBoxCoder',
57
+ target_means=[.0, .0, .0, .0],
58
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
59
+ loss_cls=dict(
60
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
61
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
62
+ ),
63
+ roi_head=dict(
64
+ type='StandardRoIHead',
65
+ bbox_roi_extractor=dict(
66
+ type='SingleRoIExtractor',
67
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
68
+ out_channels=256,
69
+ featmap_strides=[4, 8, 16, 32]),
70
+ bbox_head=dict(
71
+ type='Shared2FCBBoxHead',
72
+ in_channels=256,
73
+ fc_out_channels=1024,
74
+ roi_feat_size=7,
75
+ num_classes=1,
76
+ bbox_coder=dict(
77
+ type='DeltaXYWHBBoxCoder',
78
+ target_means=[0., 0., 0., 0.],
79
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
80
+ reg_class_agnostic=True,
81
+ loss_cls=dict(
82
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
83
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
84
+ # model training and testing settings
85
+ train_cfg=dict(
86
+ rpn=dict(
87
+ assigner=dict(
88
+ type='MaxIoUAssigner',
89
+ pos_iou_thr=0.7,
90
+ neg_iou_thr=0.3,
91
+ min_pos_iou=0.3,
92
+ match_low_quality=True,
93
+ ignore_iof_thr=-1),
94
+ sampler=dict(
95
+ type='RandomSampler',
96
+ num=256,
97
+ pos_fraction=0.5,
98
+ neg_pos_ub=-1,
99
+ add_gt_as_proposals=False),
100
+ allowed_border=-1,
101
+ pos_weight=-1,
102
+ debug=False),
103
+ rpn_proposal=dict(
104
+ nms_pre=2000,
105
+ max_per_img=1000,
106
+ nms=dict(type='nms', iou_threshold=0.7),
107
+ min_bbox_size=0),
108
+ rcnn=dict(
109
+ assigner=dict(
110
+ type='MaxIoUAssigner',
111
+ pos_iou_thr=0.5,
112
+ neg_iou_thr=0.5,
113
+ min_pos_iou=0.5,
114
+ match_low_quality=False,
115
+ ignore_iof_thr=-1),
116
+ sampler=dict(
117
+ type='RandomSampler',
118
+ num=512,
119
+ pos_fraction=0.25,
120
+ neg_pos_ub=-1,
121
+ add_gt_as_proposals=True),
122
+ pos_weight=-1,
123
+ debug=False)),
124
+ test_cfg=dict(
125
+ rpn=dict(
126
+ nms_pre=1000,
127
+ max_per_img=1000,
128
+ nms=dict(type='nms', iou_threshold=0.7),
129
+ min_bbox_size=0),
130
+ rcnn=dict(
131
+ score_thr=0.02,
132
+ # nms=dict(type='nms', iou_threshold=0.5),
133
+ nms=dict(type='nms',
134
+ iou_threshold=0.5,
135
+ class_agnostic=True,
136
+ split_thr=100000),
137
+ max_per_img=50,
138
+ mask_thr_binary=0.5)
139
+ # soft-nms is also supported for rcnn testing
140
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
141
+ ),
142
+ track_head=dict(
143
+ type='MasaTrackHead',
144
+ roi_extractor=dict(
145
+ type='SingleRoIExtractor',
146
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
147
+ out_channels=256,
148
+ featmap_strides=[4, 8, 16, 32]),
149
+ embed_head=dict(
150
+ type='QuasiDenseEmbedHead',
151
+ num_convs=4,
152
+ num_fcs=1,
153
+ embed_channels=256,
154
+ norm_cfg=dict(type='GN', num_groups=32),
155
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
156
+ loss_track_aux=dict(
157
+ type='MarginL2Loss',
158
+ neg_pos_ub=3,
159
+ pos_margin=0,
160
+ neg_margin=0.1,
161
+ hard_mining=True,
162
+ loss_weight=1.0)),
163
+ train_cfg=dict(
164
+ assigner=dict(
165
+ type='MaxIoUAssigner',
166
+ pos_iou_thr=0.7,
167
+ neg_iou_thr=0.3,
168
+ min_pos_iou=0.5,
169
+ match_low_quality=False,
170
+ ignore_iof_thr=-1),
171
+ sampler=dict(
172
+ type='CombinedSampler',
173
+ num=512,
174
+ pos_fraction=0.5,
175
+ neg_pos_ub=3,
176
+ add_gt_as_proposals=True,
177
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
178
+ neg_sampler=dict(type='RandomSampler')))),
179
+ tracker=dict(
180
+ type='MasaTaoTracker',
181
+ init_score_thr=0.0001,
182
+ obj_score_thr=0.0001,
183
+ match_score_thr=0.5,
184
+ memo_tracklet_frames=10,
185
+ memo_momentum=0.8,
186
+ with_cats=False,
187
+ max_distance=-1,
188
+ fps=1,
189
+ )
190
+ )
191
+
192
+ test_pipeline = [
193
+ dict(
194
+ type='TransformBroadcaster',
195
+ transforms=[
196
+ dict(type='LoadImageFromFile'),
197
+ dict(
198
+ type='Resize',
199
+ scale=(1024, 1024),
200
+ keep_ratio=True),
201
+ dict(type='LoadTrackAnnotations')
202
+ ]),
203
+ dict(type='PackTrackInputs')
204
+ ]
205
+
206
+
207
+ train_dataloader = None
208
+ train_cfg = None
209
+ val_cfg = dict(type='ValLoop')
210
+ test_cfg = dict(type='TestLoop')
211
+
212
+ default_hooks = dict(
213
+ logger=dict(type='LoggerHook', interval=50),
214
+ visualization=dict(type='TrackVisualizationHook', draw=False))
215
+
216
+ vis_backends = [dict(type='LocalVisBackend')]
217
+ visualizer = dict(
218
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
219
+
220
+ # custom hooks
221
+ custom_hooks = [
222
+ # Synchronize model buffers such as running_mean and running_var in BN
223
+ # at the end of each epoch
224
+ dict(type='SyncBuffersHook')
225
+ ]
226
+ auto_scale_lr = dict(enable=False, base_batch_size=16)
227
+ val_dataloader = dict(
228
+ dataset=dict(
229
+ ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
230
+ pipeline=test_pipeline,
231
+ )
232
+ )
233
+ test_dataloader = val_dataloader
234
+ val_evaluator = dict(
235
+ ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
236
+ outfile_prefix='results/masa_results/masa-sam-vitb-tao-test-teter-swinT-dets',
237
+ )
238
+ test_evaluator = val_evaluator
configs/masa-sam/tao_teta_test/masa_sam_vith_tao_test_detic_dets.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../sam-vith.py',
3
+ '../../datasets/tao/tao_dataset_v1.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ detector['init_cfg'] = dict(
9
+ type='Pretrained',
10
+ checkpoint= 'saved_models/pretrain_weights/sam_vit_h_4b8939_mmdet.pth'
11
+ # noqa: E501
12
+ )
13
+ detector['type'] = 'SamMasa'
14
+
15
+ del _base_.model
16
+
17
+ model = dict(
18
+ type='MASA',
19
+ freeze_detector=True,
20
+ unified_backbone=True,
21
+ load_public_dets = True,
22
+ benchmark = 'tao',
23
+ public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/detic_tao_val_det/',
24
+ data_preprocessor=dict(
25
+ type='TrackDataPreprocessor',
26
+ # Image normalization parameters
27
+ mean=[123.675, 116.28, 103.53],
28
+ std=[58.395, 57.12, 57.375],
29
+ bgr_to_rgb=True,
30
+ # Image padding parameters
31
+ pad_mask=False, # Mask padding is disabled here; in instance segmentation, the mask needs to be padded
32
+ pad_size_divisor=1024), # Padding the image to multiples of 1024
33
+ detector=detector,
34
+ masa_adapter=[
35
+ dict(
36
+ type='SimpleFPN',
37
+ in_channels=[1280, 1280, 1280, 1280],
38
+ out_channels=256,
39
+ use_residual=True,
40
+ num_outs=5),
41
+ dict(
42
+ type='DyHead',
43
+ in_channels=256,
44
+ out_channels=256,
45
+ num_blocks=3)
46
+ ],
47
+ rpn_head=dict(
48
+ type='RPNHead',
49
+ in_channels=256,
50
+ feat_channels=256,
51
+ anchor_generator=dict(
52
+ type='AnchorGenerator',
53
+ scales=[8],
54
+ ratios=[0.5, 1.0, 2.0],
55
+ strides=[4, 8, 16, 32, 64]),
56
+ bbox_coder=dict(
57
+ type='DeltaXYWHBBoxCoder',
58
+ target_means=[.0, .0, .0, .0],
59
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
60
+ loss_cls=dict(
61
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
62
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
63
+ ),
64
+ roi_head=dict(
65
+ type='StandardRoIHead',
66
+ bbox_roi_extractor=dict(
67
+ type='SingleRoIExtractor',
68
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
69
+ out_channels=256,
70
+ featmap_strides=[4, 8, 16, 32]),
71
+ bbox_head=dict(
72
+ type='Shared2FCBBoxHead',
73
+ in_channels=256,
74
+ fc_out_channels=1024,
75
+ roi_feat_size=7,
76
+ num_classes=1,
77
+ bbox_coder=dict(
78
+ type='DeltaXYWHBBoxCoder',
79
+ target_means=[0., 0., 0., 0.],
80
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
81
+ reg_class_agnostic=True,
82
+ loss_cls=dict(
83
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
84
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
85
+ # model training and testing settings
86
+ train_cfg=dict(
87
+ rpn=dict(
88
+ assigner=dict(
89
+ type='MaxIoUAssigner',
90
+ pos_iou_thr=0.7,
91
+ neg_iou_thr=0.3,
92
+ min_pos_iou=0.3,
93
+ match_low_quality=True,
94
+ ignore_iof_thr=-1),
95
+ sampler=dict(
96
+ type='RandomSampler',
97
+ num=256,
98
+ pos_fraction=0.5,
99
+ neg_pos_ub=-1,
100
+ add_gt_as_proposals=False),
101
+ allowed_border=-1,
102
+ pos_weight=-1,
103
+ debug=False),
104
+ rpn_proposal=dict(
105
+ nms_pre=2000,
106
+ max_per_img=1000,
107
+ nms=dict(type='nms', iou_threshold=0.7),
108
+ min_bbox_size=0),
109
+ rcnn=dict(
110
+ assigner=dict(
111
+ type='MaxIoUAssigner',
112
+ pos_iou_thr=0.5,
113
+ neg_iou_thr=0.5,
114
+ min_pos_iou=0.5,
115
+ match_low_quality=False,
116
+ ignore_iof_thr=-1),
117
+ sampler=dict(
118
+ type='RandomSampler',
119
+ num=512,
120
+ pos_fraction=0.25,
121
+ neg_pos_ub=-1,
122
+ add_gt_as_proposals=True),
123
+ pos_weight=-1,
124
+ debug=False)),
125
+ test_cfg=dict(
126
+ rpn=dict(
127
+ nms_pre=1000,
128
+ max_per_img=1000,
129
+ nms=dict(type='nms', iou_threshold=0.7),
130
+ min_bbox_size=0),
131
+ rcnn=dict(
132
+ score_thr=0.02,
133
+ # nms=dict(type='nms', iou_threshold=0.5),
134
+ nms=dict(type='nms',
135
+ iou_threshold=0.5,
136
+ class_agnostic=True,
137
+ split_thr=100000),
138
+ max_per_img=50,
139
+ mask_thr_binary=0.5)
140
+ # soft-nms is also supported for rcnn testing
141
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
142
+ ),
143
+ track_head=dict(
144
+ type='MasaTrackHead',
145
+ roi_extractor=dict(
146
+ type='SingleRoIExtractor',
147
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
148
+ out_channels=256,
149
+ featmap_strides=[4, 8, 16, 32]),
150
+ embed_head=dict(
151
+ type='QuasiDenseEmbedHead',
152
+ num_convs=4,
153
+ num_fcs=1,
154
+ embed_channels=256,
155
+ norm_cfg=dict(type='GN', num_groups=32),
156
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
157
+ loss_track_aux=dict(
158
+ type='MarginL2Loss',
159
+ neg_pos_ub=3,
160
+ pos_margin=0,
161
+ neg_margin=0.1,
162
+ hard_mining=True,
163
+ loss_weight=1.0)),
164
+ # loss_bbox=dict(type='L1Loss', loss_weight=1.0),
165
+ train_cfg=dict(
166
+ assigner=dict(
167
+ type='MaxIoUAssigner',
168
+ pos_iou_thr=0.7,
169
+ neg_iou_thr=0.5,
170
+ min_pos_iou=0.5,
171
+ match_low_quality=False,
172
+ ignore_iof_thr=-1),
173
+ sampler=dict(
174
+ type='CombinedSampler',
175
+ num=512,
176
+ pos_fraction=0.8,
177
+ neg_pos_ub=3,
178
+ add_gt_as_proposals=True,
179
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
180
+ neg_sampler=dict(type='RandomSampler')))),
181
+ tracker=dict(
182
+ type='MasaTaoTracker',
183
+ init_score_thr=0.0001,
184
+ obj_score_thr=0.0001,
185
+ match_score_thr=0.5,
186
+ memo_tracklet_frames=10,
187
+ memo_momentum=0.8,
188
+ with_cats=False,
189
+ max_distance=-1,
190
+ fps=1,
191
+ )
192
+ )
193
+
194
+ test_pipeline = [
195
+ dict(
196
+ type='TransformBroadcaster',
197
+ transforms=[
198
+ dict(type='LoadImageFromFile'),
199
+ dict(
200
+ type='Resize',
201
+ scale=(1024, 1024),
202
+ keep_ratio=True),
203
+ dict(type='LoadTrackAnnotations')
204
+ ]),
205
+ dict(type='PackTrackInputs')
206
+ ]
207
+
208
+
209
+ train_dataloader = None
210
+ train_cfg = None
211
+ val_cfg = dict(type='ValLoop')
212
+ test_cfg = dict(type='TestLoop')
213
+
214
+ default_hooks = dict(
215
+ logger=dict(type='LoggerHook', interval=50),
216
+ visualization=dict(type='TrackVisualizationHook', draw=False))
217
+
218
+ vis_backends = [dict(type='LocalVisBackend')]
219
+ visualizer = dict(
220
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
221
+
222
+ auto_scale_lr = dict(enable=False, base_batch_size=16)
223
+ val_dataloader = dict(
224
+ dataset=dict(
225
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
226
+ pipeline=test_pipeline,
227
+ )
228
+ )
229
+ test_dataloader = val_dataloader
230
+ test_evaluator = dict(
231
+ ann_file='data/tao/annotations/tao_val_lvis_v1_classes.json',
232
+ outfile_prefix='results/masa_results/masa-sam-vith-tao-test-detic-dets',
233
+ )
configs/masa-sam/tao_teta_test/masa_sam_vith_tao_test_teter_swinT_dets.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = [
2
+ '../sam-vith.py',
3
+ '../../datasets/tao/tao_dataset_v05.py',
4
+ '../../default_runtime.py'
5
+ ]
6
+ default_scope = 'mmdet'
7
+ detector = _base_.model
8
+ detector['init_cfg'] = dict(
9
+ type='Pretrained',
10
+ checkpoint= 'saved_models/pretrain_weights/sam_vit_h_4b8939_mmdet.pth'
11
+ # noqa: E501
12
+ )
13
+ detector['type'] = 'SamMasa'
14
+
15
+ del _base_.model
16
+
17
+ model = dict(
18
+ type='MASA',
19
+ freeze_detector=True,
20
+ unified_backbone=True,
21
+ load_public_dets = True,
22
+ public_det_path = 'results/public_dets/tao_val_dets/teta_50_internms/teter_swinT_tao_val_internms_50/',
23
+ data_preprocessor=dict(
24
+ type='TrackDataPreprocessor',
25
+ # Image normalization parameters
26
+ mean=[123.675, 116.28, 103.53],
27
+ std=[58.395, 57.12, 57.375],
28
+ bgr_to_rgb=True,
29
+ # Image padding parameters
30
+ pad_mask=False, # Mask padding is disabled here; in instance segmentation, the mask needs to be padded
31
+ pad_size_divisor=1024), # Padding the image to multiples of 1024
32
+ detector=detector,
33
+ masa_adapter=[
34
+ dict(
35
+ type='SimpleFPN',
36
+ in_channels=[1280, 1280, 1280, 1280],
37
+ out_channels=256,
38
+ use_residual=True,
39
+ num_outs=5),
40
+ dict(
41
+ type='DyHead',
42
+ in_channels=256,
43
+ out_channels=256,
44
+ num_blocks=3)
45
+ ],
46
+ rpn_head=dict(
47
+ type='RPNHead',
48
+ in_channels=256,
49
+ feat_channels=256,
50
+ anchor_generator=dict(
51
+ type='AnchorGenerator',
52
+ scales=[8],
53
+ ratios=[0.5, 1.0, 2.0],
54
+ strides=[4, 8, 16, 32, 64]),
55
+ bbox_coder=dict(
56
+ type='DeltaXYWHBBoxCoder',
57
+ target_means=[.0, .0, .0, .0],
58
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
59
+ loss_cls=dict(
60
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
61
+ loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)
62
+ ),
63
+ roi_head=dict(
64
+ type='StandardRoIHead',
65
+ bbox_roi_extractor=dict(
66
+ type='SingleRoIExtractor',
67
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
68
+ out_channels=256,
69
+ featmap_strides=[4, 8, 16, 32]),
70
+ bbox_head=dict(
71
+ type='Shared2FCBBoxHead',
72
+ in_channels=256,
73
+ fc_out_channels=1024,
74
+ roi_feat_size=7,
75
+ num_classes=1,
76
+ bbox_coder=dict(
77
+ type='DeltaXYWHBBoxCoder',
78
+ target_means=[0., 0., 0., 0.],
79
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
80
+ reg_class_agnostic=True,
81
+ loss_cls=dict(
82
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
83
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
84
+ # model training and testing settings
85
+ train_cfg=dict(
86
+ rpn=dict(
87
+ assigner=dict(
88
+ type='MaxIoUAssigner',
89
+ pos_iou_thr=0.7,
90
+ neg_iou_thr=0.3,
91
+ min_pos_iou=0.3,
92
+ match_low_quality=True,
93
+ ignore_iof_thr=-1),
94
+ sampler=dict(
95
+ type='RandomSampler',
96
+ num=256,
97
+ pos_fraction=0.5,
98
+ neg_pos_ub=-1,
99
+ add_gt_as_proposals=False),
100
+ allowed_border=-1,
101
+ pos_weight=-1,
102
+ debug=False),
103
+ rpn_proposal=dict(
104
+ nms_pre=2000,
105
+ max_per_img=1000,
106
+ nms=dict(type='nms', iou_threshold=0.7),
107
+ min_bbox_size=0),
108
+ rcnn=dict(
109
+ assigner=dict(
110
+ type='MaxIoUAssigner',
111
+ pos_iou_thr=0.5,
112
+ neg_iou_thr=0.5,
113
+ min_pos_iou=0.5,
114
+ match_low_quality=False,
115
+ ignore_iof_thr=-1),
116
+ sampler=dict(
117
+ type='RandomSampler',
118
+ num=512,
119
+ pos_fraction=0.25,
120
+ neg_pos_ub=-1,
121
+ add_gt_as_proposals=True),
122
+ pos_weight=-1,
123
+ debug=False)),
124
+ test_cfg=dict(
125
+ rpn=dict(
126
+ nms_pre=1000,
127
+ max_per_img=1000,
128
+ nms=dict(type='nms', iou_threshold=0.7),
129
+ min_bbox_size=0),
130
+ rcnn=dict(
131
+ score_thr=0.02,
132
+ # nms=dict(type='nms', iou_threshold=0.5),
133
+ nms=dict(type='nms',
134
+ iou_threshold=0.5,
135
+ class_agnostic=True,
136
+ split_thr=100000),
137
+ max_per_img=50,
138
+ mask_thr_binary=0.5)
139
+ # soft-nms is also supported for rcnn testing
140
+ # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
141
+ ),
142
+ track_head=dict(
143
+ type='MasaTrackHead',
144
+ roi_extractor=dict(
145
+ type='SingleRoIExtractor',
146
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
147
+ out_channels=256,
148
+ featmap_strides=[4, 8, 16, 32]),
149
+ embed_head=dict(
150
+ type='QuasiDenseEmbedHead',
151
+ num_convs=4,
152
+ num_fcs=1,
153
+ embed_channels=256,
154
+ norm_cfg=dict(type='GN', num_groups=32),
155
+ loss_track=dict(type='UnbiasedContrastLoss', loss_weight=0.25),
156
+ loss_track_aux=dict(
157
+ type='MarginL2Loss',
158
+ neg_pos_ub=3,
159
+ pos_margin=0,
160
+ neg_margin=0.1,
161
+ hard_mining=True,
162
+ loss_weight=1.0)),
163
+ # loss_bbox=dict(type='L1Loss', loss_weight=1.0),
164
+ train_cfg=dict(
165
+ assigner=dict(
166
+ type='MaxIoUAssigner',
167
+ pos_iou_thr=0.7,
168
+ neg_iou_thr=0.5,
169
+ min_pos_iou=0.5,
170
+ match_low_quality=False,
171
+ ignore_iof_thr=-1),
172
+ sampler=dict(
173
+ type='CombinedSampler',
174
+ num=512,
175
+ pos_fraction=0.8,
176
+ neg_pos_ub=3,
177
+ add_gt_as_proposals=True,
178
+ pos_sampler=dict(type='InstanceBalancedPosSampler'),
179
+ neg_sampler=dict(type='RandomSampler')))),
180
+ tracker=dict(
181
+ type='MasaTaoTracker',
182
+ init_score_thr=0.0001,
183
+ obj_score_thr=0.0001,
184
+ match_score_thr=0.5,
185
+ memo_tracklet_frames=10,
186
+ memo_momentum=0.8,
187
+ with_cats=False,
188
+ max_distance=-1,
189
+ fps=1,
190
+ )
191
+ )
192
+
193
+ test_pipeline = [
194
+ dict(
195
+ type='TransformBroadcaster',
196
+ transforms=[
197
+ dict(type='LoadImageFromFile'),
198
+ dict(
199
+ type='Resize',
200
+ scale=(1024, 1024),
201
+ keep_ratio=True),
202
+ dict(type='LoadTrackAnnotations')
203
+ ]),
204
+ dict(type='PackTrackInputs')
205
+ ]
206
+
207
+
208
+ train_dataloader = None
209
+ train_cfg = None
210
+ val_cfg = dict(type='ValLoop')
211
+ test_cfg = dict(type='TestLoop')
212
+
213
+ default_hooks = dict(
214
+ logger=dict(type='LoggerHook', interval=50),
215
+ visualization=dict(type='TrackVisualizationHook', draw=False))
216
+
217
+ vis_backends = [dict(type='LocalVisBackend')]
218
+ visualizer = dict(
219
+ type='MasaTrackLocalVisualizer', vis_backends=vis_backends, name='visualizer')
220
+
221
+ # custom hooks
222
+ custom_hooks = [
223
+ # Synchronize model buffers such as running_mean and running_var in BN
224
+ # at the end of each epoch
225
+ dict(type='SyncBuffersHook')
226
+ ]
227
+ auto_scale_lr = dict(enable=False, base_batch_size=16)
228
+ val_dataloader = dict(
229
+ dataset=dict(
230
+ ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
231
+ pipeline=test_pipeline,
232
+ )
233
+ )
234
+ test_dataloader = val_dataloader
235
+ val_evaluator = dict(
236
+ ann_file='data/tao/annotations/tao_val_lvis_v05_classes.json',
237
+ outfile_prefix='results/masa_results/masa-sam-vith-tao-test-teter-swinT-dets',
238
+ )
239
+ test_evaluator = val_evaluator
environment_docker.yml ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: masaenv
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - conda-forge
6
+ - defaults
7
+ dependencies:
8
+ - _libgcc_mutex=0.1=conda_forge
9
+ - _openmp_mutex=4.5=2_gnu
10
+ - aom=3.9.1=hac33072_0
11
+ - blas=1.0=mkl
12
+ - brotli-python=1.0.9=py311h6a678d5_8
13
+ - bzip2=1.0.8=h5eee18b_6
14
+ - ca-certificates=2024.6.2=hbcca054_0
15
+ - cairo=1.18.0=h3faef2a_0
16
+ - certifi=2024.6.2=pyhd8ed1ab_0
17
+ - charset-normalizer=2.0.4=pyhd3eb1b0_0
18
+ - cuda-cudart=11.8.89=0
19
+ - cuda-cupti=11.8.87=0
20
+ - cuda-libraries=11.8.0=0
21
+ - cuda-nvrtc=11.8.89=0
22
+ - cuda-nvtx=11.8.86=0
23
+ - cuda-runtime=11.8.0=0
24
+ - cudatoolkit=11.8.0=h6a678d5_0
25
+ - dav1d=1.2.1=hd590300_0
26
+ - expat=2.6.2=h59595ed_0
27
+ - ffmpeg=7.0.1=gpl_hb399a10_100
28
+ - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
29
+ - font-ttf-inconsolata=3.000=h77eed37_0
30
+ - font-ttf-source-code-pro=2.038=h77eed37_0
31
+ - font-ttf-ubuntu=0.83=h77eed37_2
32
+ - fontconfig=2.14.2=h14ed4e7_0
33
+ - fonts-conda-ecosystem=1=0
34
+ - fonts-conda-forge=1=0
35
+ - freetype=2.12.1=h4a9f257_0
36
+ - fribidi=1.0.10=h36c2ea0_0
37
+ - gmp=6.3.0=h59595ed_1
38
+ - gmpy2=2.1.2=py311hc9b5ff0_0
39
+ - gnutls=3.7.9=hb077bed_0
40
+ - graphite2=1.3.13=h59595ed_1003
41
+ - harfbuzz=8.5.0=hfac3d4d_0
42
+ - icu=73.2=h59595ed_0
43
+ - idna=3.7=py311h06a4308_0
44
+ - intel-openmp=2023.1.0=hdb19cb5_46306
45
+ - jinja2=3.1.4=py311h06a4308_0
46
+ - jpeg=9e=h5eee18b_1
47
+ - lame=3.100=h7b6447c_0
48
+ - lcms2=2.12=h3be6417_0
49
+ - ld_impl_linux-64=2.38=h1181459_1
50
+ - lerc=3.0=h295c915_0
51
+ - libabseil=20240116.2=cxx17_h59595ed_0
52
+ - libass=0.17.1=h8fe9dca_1
53
+ - libcublas=11.11.3.6=0
54
+ - libcufft=10.9.0.58=0
55
+ - libcufile=1.9.1.3=0
56
+ - libcurand=10.3.5.147=0
57
+ - libcusolver=11.4.1.48=0
58
+ - libcusparse=11.7.5.86=0
59
+ - libdeflate=1.17=h5eee18b_1
60
+ - libdrm=2.4.120=hd590300_0
61
+ - libexpat=2.6.2=h59595ed_0
62
+ - libffi=3.4.4=h6a678d5_1
63
+ - libgcc-ng=13.2.0=h77fa898_10
64
+ - libglib=2.80.2=hf974151_0
65
+ - libgomp=13.2.0=h77fa898_10
66
+ - libhwloc=2.10.0=default_h5622ce7_1001
67
+ - libiconv=1.17=hd590300_2
68
+ - libidn2=2.3.4=h5eee18b_0
69
+ - libjpeg-turbo=2.0.0=h9bf148f_0
70
+ - libnpp=11.8.0.86=0
71
+ - libnsl=2.0.1=hd590300_0
72
+ - libnvjpeg=11.9.0.86=0
73
+ - libopenvino=2024.1.0=h2da1b83_7
74
+ - libopenvino-auto-batch-plugin=2024.1.0=hb045406_7
75
+ - libopenvino-auto-plugin=2024.1.0=hb045406_7
76
+ - libopenvino-hetero-plugin=2024.1.0=h5c03a75_7
77
+ - libopenvino-intel-cpu-plugin=2024.1.0=h2da1b83_7
78
+ - libopenvino-intel-gpu-plugin=2024.1.0=h2da1b83_7
79
+ - libopenvino-intel-npu-plugin=2024.1.0=he02047a_7
80
+ - libopenvino-ir-frontend=2024.1.0=h5c03a75_7
81
+ - libopenvino-onnx-frontend=2024.1.0=h07e8aee_7
82
+ - libopenvino-paddle-frontend=2024.1.0=h07e8aee_7
83
+ - libopenvino-pytorch-frontend=2024.1.0=he02047a_7
84
+ - libopenvino-tensorflow-frontend=2024.1.0=h39126c6_7
85
+ - libopenvino-tensorflow-lite-frontend=2024.1.0=he02047a_7
86
+ - libopus=1.3.1=h7f98852_1
87
+ - libpciaccess=0.18=hd590300_0
88
+ - libpng=1.6.39=h5eee18b_0
89
+ - libprotobuf=4.25.3=h08a7969_0
90
+ - libsqlite=3.46.0=hde9e2c9_0
91
+ - libstdcxx-ng=13.2.0=hc0a3c3a_10
92
+ - libtasn1=4.19.0=h5eee18b_0
93
+ - libtiff=4.5.1=h6a678d5_0
94
+ - libunistring=0.9.10=h27cfd23_0
95
+ - libuuid=2.38.1=h0b41bf4_0
96
+ - libva=2.21.0=h4ab18f5_2
97
+ - libvpx=1.14.1=hac33072_0
98
+ - libwebp-base=1.3.2=h5eee18b_0
99
+ - libxcb=1.15=h0b41bf4_0
100
+ - libxcrypt=4.4.36=hd590300_1
101
+ - libxml2=2.12.7=hc051c1a_1
102
+ - libzlib=1.2.13=h4ab18f5_6
103
+ - llvm-openmp=14.0.6=h9e868ea_0
104
+ - lz4-c=1.9.4=h6a678d5_1
105
+ - markupsafe=2.1.3=py311h5eee18b_0
106
+ - mkl=2023.1.0=h213fc3f_46344
107
+ - mkl-service=2.4.0=py311h5eee18b_1
108
+ - mkl_fft=1.3.8=py311h5eee18b_0
109
+ - mkl_random=1.2.4=py311hdb19cb5_0
110
+ - mpc=1.1.0=h10f8cd9_1
111
+ - mpfr=4.0.2=hb69a4c5_1
112
+ - mpmath=1.3.0=py311h06a4308_0
113
+ - ncurses=6.4=h6a678d5_0
114
+ - nettle=3.9.1=h7ab15ed_0
115
+ - networkx=3.2.1=py311h06a4308_0
116
+ - numpy=1.26.4=py311h08b1b3b_0
117
+ - numpy-base=1.26.4=py311hf175353_0
118
+ - ocl-icd=2.3.2=hd590300_1
119
+ - openh264=2.4.1=h59595ed_0
120
+ - openjpeg=2.4.0=h3ad879b_0
121
+ - openssl=3.3.1=h4ab18f5_0
122
+ - p11-kit=0.24.1=hc5aa10d_0
123
+ - pcre2=10.43=hcad00b1_0
124
+ - pillow=10.3.0=py311h5eee18b_0
125
+ - pip=24.0=py311h06a4308_0
126
+ - pixman=0.43.2=h59595ed_0
127
+ - pthread-stubs=0.4=h36c2ea0_1001
128
+ - pugixml=1.14=h59595ed_0
129
+ - pysocks=1.7.1=py311h06a4308_0
130
+ - python=3.11.8=hab00c5b_0_cpython
131
+ - pytorch=2.1.2=py3.11_cuda11.8_cudnn8.7.0_0
132
+ - pytorch-cuda=11.8=h7e8668a_5
133
+ - pytorch-mutex=1.0=cuda
134
+ - pyyaml=6.0.1=py311h5eee18b_0
135
+ - readline=8.2=h5eee18b_0
136
+ - snappy=1.2.0=hdb0a2a9_1
137
+ - sqlite=3.45.3=h5eee18b_0
138
+ - svt-av1=2.1.0=hac33072_0
139
+ - sympy=1.12=py311h06a4308_0
140
+ - tbb=2021.12.0=h297d8ca_1
141
+ - tk=8.6.14=h39e8969_0
142
+ - torchaudio=2.1.2=py311_cu118
143
+ - torchtriton=2.1.0=py311
144
+ - torchvision=0.16.2=py311_cu118
145
+ - typing_extensions=4.11.0=py311h06a4308_0
146
+ - wheel=0.43.0=py311h06a4308_0
147
+ - x264=1!164.3095=h166bdaf_2
148
+ - x265=3.5=h924138e_3
149
+ - xorg-fixesproto=5.0=h7f98852_1002
150
+ - xorg-kbproto=1.0.7=h7f98852_1002
151
+ - xorg-libice=1.1.1=hd590300_0
152
+ - xorg-libsm=1.2.4=h7391055_0
153
+ - xorg-libx11=1.8.9=h8ee46fc_0
154
+ - xorg-libxau=1.0.11=hd590300_0
155
+ - xorg-libxdmcp=1.1.3=h7f98852_0
156
+ - xorg-libxext=1.3.4=h0b41bf4_2
157
+ - xorg-libxfixes=5.0.3=h7f98852_1004
158
+ - xorg-libxrender=0.9.11=hd590300_0
159
+ - xorg-renderproto=0.11.1=h7f98852_1002
160
+ - xorg-xextproto=7.3.0=h0b41bf4_1003
161
+ - xorg-xproto=7.0.31=h7f98852_1007
162
+ - xz=5.4.6=h5eee18b_1
163
+ - yaml=0.2.5=h7b6447c_0
164
+ - zlib=1.2.13=h4ab18f5_6
165
+ - zstd=1.5.5=hc292b87_2
166
+ - pip:
167
+ - addict==2.4.0
168
+ - aiofiles==23.2.1
169
+ - aliyun-python-sdk-core==2.15.1
170
+ - aliyun-python-sdk-kms==2.16.3
171
+ - altair==5.3.0
172
+ - annotated-types==0.7.0
173
+ - anyio==4.4.0
174
+ - attrs==23.2.0
175
+ - boto3==1.34.128
176
+ - botocore==1.34.128
177
+ - cffi==1.16.0
178
+ - click==8.1.7
179
+ - clip==1.0
180
+ - colorama==0.4.6
181
+ - contourpy==1.2.1
182
+ - crcmod==1.7
183
+ - cryptography==42.0.8
184
+ - cycler==0.12.1
185
+ - cython==3.0.10
186
+ - decorator==4.4.2
187
+ - defusedxml==0.7.1
188
+ - dnspython==2.6.1
189
+ - einops==0.8.0
190
+ - email-validator==2.1.2
191
+ - fairscale==0.4.13
192
+ - fastapi==0.111.0
193
+ - fastapi-cli==0.0.4
194
+ - ffmpy==0.3.2
195
+ - filelock==3.14.0
196
+ - fonttools==4.53.0
197
+ - fsspec==2024.6.0
198
+ - ftfy==6.2.0
199
+ - gradio==4.36.1
200
+ - gradio-client==1.0.1
201
+ - h11==0.14.0
202
+ - h5py==3.11.0
203
+ - httpcore==1.0.5
204
+ - httptools==0.6.1
205
+ - httpx==0.27.0
206
+ - huggingface-hub==0.23.4
207
+ - imageio==2.34.1
208
+ - importlib-metadata==7.1.0
209
+ - importlib-resources==6.4.0
210
+ - jmespath==0.10.0
211
+ - joblib==1.4.2
212
+ - jsonschema==4.22.0
213
+ - jsonschema-specifications==2023.12.1
214
+ - kiwisolver==1.4.5
215
+ - llvmlite==0.43.0
216
+ - lvis==0.5.3
217
+ - markdown==3.6
218
+ - markdown-it-py==3.0.0
219
+ - matplotlib==3.9.0
220
+ - mdurl==0.1.2
221
+ - mmcv==2.1.0
222
+ - mmdet==3.3.0
223
+ - mmengine==0.10.4
224
+ - model-index==0.1.11
225
+ - motmetrics==1.4.0
226
+ - moviepy==0.2.3.5
227
+ - nanoid==2.0.0
228
+ - natsort==8.4.0
229
+ - nltk==3.8.1
230
+ - numba==0.60.0
231
+ - opencv-python==4.10.0.84
232
+ - opencv-python-headless==4.10.0.84
233
+ - opendatalab==0.0.10
234
+ - openmim==0.3.9
235
+ - openxlab==0.1.0
236
+ - ordered-set==4.1.0
237
+ - orjson==3.10.5
238
+ - oss2==2.17.0
239
+ - packaging==24.1
240
+ - pandas==2.2.2
241
+ - platformdirs==4.2.2
242
+ - plyfile==1.0.3
243
+ - psutil==5.9.8
244
+ - pycocotools==2.0.8
245
+ - pycparser==2.22
246
+ - pycryptodome==3.20.0
247
+ - pydantic==2.7.4
248
+ - pydantic-core==2.18.4
249
+ - pydub==0.25.1
250
+ - pygments==2.18.0
251
+ - pyparsing==3.1.2
252
+ - python-dateutil==2.9.0.post0
253
+ - python-dotenv==1.0.1
254
+ - python-multipart==0.0.9
255
+ - pytz==2023.4
256
+ - referencing==0.35.1
257
+ - regex==2024.5.15
258
+ - requests==2.32.3
259
+ - rich==13.4.2
260
+ - rpds-py==0.18.1
261
+ - ruff==0.4.9
262
+ - s3transfer==0.10.1
263
+ - safetensors==0.4.3
264
+ - scalabel==0.3.0
265
+ - scipy==1.13.1
266
+ - script-utils==0.0.1
267
+ - seaborn==0.13.2
268
+ - semantic-version==2.10.0
269
+ - setuptools==60.2.0
270
+ - shapely==2.0.4
271
+ - shellingham==1.5.4
272
+ - six==1.16.0
273
+ - sniffio==1.3.1
274
+ - starlette==0.37.2
275
+ - supervision==0.21.0
276
+ - tabulate==0.9.0
277
+ - tao==0.1.0
278
+ - termcolor==2.4.0
279
+ - terminaltables==3.1.10
280
+ - teta==0.1.0
281
+ - tokenizers==0.15.2
282
+ - toml==0.10.2
283
+ - tomli==2.0.1
284
+ - tomlkit==0.12.0
285
+ - toolz==0.12.1
286
+ - tqdm==4.65.2
287
+ - trackeval==1.0.dev1
288
+ - transformers==4.38.2
289
+ - typer==0.12.3
290
+ - tzdata==2024.1
291
+ - ujson==5.10.0
292
+ - urllib3==2.2.2
293
+ - uvicorn==0.30.1
294
+ - uvloop==0.19.0
295
+ - watchfiles==0.22.0
296
+ - wcwidth==0.2.13
297
+ - websockets==11.0.3
298
+ - xmltodict==0.13.0
299
+ - yacs==0.1.8
300
+ - yapf==0.40.2
301
+ - youtube-dl==2021.12.17
302
+ - zipp==3.19.2
masa/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .datasets import * # noqa
2
+ from .models import * # noqa
3
+ from .visualization import * # noqa
masa/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (267 Bytes). View file
 
masa/apis/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ from .masa_inference import (build_test_pipeline, inference_detector,
3
+ inference_masa, init_masa)
4
+
5
+ __all__ = [
6
+ "inference_masa",
7
+ "init_masa",
8
+ "inference_detector",
9
+ "build_test_pipeline",
10
+ ]
masa/apis/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (418 Bytes). View file
 
masa/apis/__pycache__/masa_inference.cpython-311.pyc ADDED
Binary file (13 kB). View file
 
masa/apis/masa_inference.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import time
3
+ import warnings
4
+ from pathlib import Path
5
+ from typing import Optional, Sequence, Union
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+ from mmcv.ops import RoIPool
11
+ from mmcv.transforms import Compose
12
+ from mmdet.evaluation import get_classes
13
+ from mmdet.registry import MODELS
14
+ from mmdet.structures import DetDataSample, SampleList
15
+ from mmdet.utils import ConfigType, get_test_pipeline_cfg
16
+ from mmengine.config import Config
17
+ from mmengine.dataset import default_collate
18
+ from mmengine.model.utils import revert_sync_batchnorm
19
+ from mmengine.registry import init_default_scope
20
+ from mmengine.runner import autocast, load_checkpoint
21
+
22
+ ImagesType = Union[str, np.ndarray, Sequence[str], Sequence[np.ndarray]]
23
+
24
+
25
def init_masa(
    config: Union[str, Path, Config],
    checkpoint: Optional[str] = None,
    palette: str = "none",
    device: str = "cuda:0",
    cfg_options: Optional[dict] = None,
) -> nn.Module:
    """Build a MASA model from a config and optionally load its weights.

    Args:
        config (str, :obj:`Path`, or :obj:`mmengine.Config`): Path of the
            config file, or an already loaded config object.
        checkpoint (str, optional): Checkpoint to load. When None, no weights
            are loaded and the COCO class names are used as metadata.
        palette (str): Visualization palette. Any value other than ``"none"``
            overrides the palette stored in the checkpoint metadata. With
            ``"none"`` the checkpoint palette is kept, and ``"random"`` is
            used when the metadata has no palette. Defaults to ``"none"``.
        device (str): Device the model is moved to. Defaults to cuda:0.
        cfg_options (dict, optional): Overrides merged into the config. When
            given, the backbone ``init_cfg`` is left untouched.

    Returns:
        nn.Module: The constructed model, switched to eval mode, with
        ``cfg`` and ``dataset_meta`` attached as attributes.

    Raises:
        TypeError: If ``config`` is neither a path nor a ``Config`` object.
    """
    if isinstance(config, (str, Path)):
        config = Config.fromfile(config)
    elif not isinstance(config, Config):
        raise TypeError(
            "config must be a filename or Config object, " f"but got {type(config)}"
        )

    # The backbone sits either directly on the model or on its nested detector.
    has_own_backbone = config.model.get("backbone", False)
    if cfg_options is not None:
        config.merge_from_dict(cfg_options)
    else:
        # Without user overrides, clear the backbone's init_cfg if it has one.
        backbone_owner = config.model if has_own_backbone else config.model.detector
        if "init_cfg" in backbone_owner.backbone:
            backbone_owner.backbone.init_cfg = None

    scope = config.get("default_scope", "mmdet")
    if scope is not None:
        init_default_scope(scope)

    model = revert_sync_batchnorm(MODELS.build(config.model))

    if checkpoint is None:
        warnings.simplefilter("once")
        warnings.warn("checkpoint is None, use COCO classes by default.")
        model.dataset_meta = {"classes": get_classes("coco")}
    else:
        loaded = load_checkpoint(model, checkpoint, map_location="cpu")
        # Weights converted from elsewhere may not have meta fields.
        meta = loaded.get("meta", {})

        if "dataset_meta" in meta:
            # mmdet 3.x, all keys should be lowercase
            model.dataset_meta = {
                key.lower(): value for key, value in meta["dataset_meta"].items()
            }
        elif "CLASSES" in meta:
            # < mmdet 3.x
            model.dataset_meta = {"classes": meta["CLASSES"]}
        else:
            warnings.simplefilter("once")
            warnings.warn(
                "dataset_meta or class names are not saved in the "
                "checkpoint's meta data, use COCO classes by default."
            )
            model.dataset_meta = {"classes": get_classes("coco")}

    # An explicitly passed palette wins over the one from the checkpoint.
    if palette != "none":
        model.dataset_meta["palette"] = palette
    elif "palette" not in model.dataset_meta:
        warnings.warn(
            "palette does not exist, random is used by default. "
            "You can also set the palette to customize."
        )
        model.dataset_meta["palette"] = "random"

    model.cfg = config  # save the config in the model for convenience
    model.to(device)
    model.eval()
    return model
118
+
119
+
120
def inference_detector(
    model: nn.Module,
    imgs: ImagesType,
    test_pipeline: Optional[Compose] = None,
    text_prompt: Optional[str] = None,
    custom_entities: bool = False,
    fp16: bool = False,
) -> Union[DetDataSample, SampleList]:
    """Run the detector on one image or a sequence of images.

    Images are processed one at a time; each is passed through the test
    pipeline and then through ``model.test_step``.

    Args:
        model (nn.Module): The loaded detector; its ``cfg`` attribute is read.
        imgs (str, ndarray, Sequence[str/ndarray]):
            Either image files or loaded images.
        test_pipeline (:obj:`Compose`, optional): Test pipeline. When None, it
            is built from ``model.cfg``.
        text_prompt (str, optional): When truthy, stored under ``"text"`` in
            the pipeline input together with ``custom_entities``.
        custom_entities (bool): Forwarded to the pipeline input alongside
            ``text_prompt``. Defaults to False.
        fp16 (bool): Whether to enable autocast for the forward pass.
            Defaults to False.

    Returns:
        :obj:`DetDataSample` or list[:obj:`DetDataSample`]:
        If imgs is a list or tuple, a list of results with the same length
        is returned, otherwise the single detection result directly.
    """
    is_batch = isinstance(imgs, (list, tuple))
    if not is_batch:
        imgs = [imgs]

    cfg = model.cfg

    if test_pipeline is None:
        pipeline_cfg = get_test_pipeline_cfg(cfg.copy())
        if isinstance(imgs[0], np.ndarray):
            # Calling this method across libraries will result
            # in module unregistered error if not prefixed with mmdet.
            pipeline_cfg[0].type = "mmdet.LoadImageFromNDArray"
        test_pipeline = Compose(pipeline_cfg)

    if model.data_preprocessor.device.type == "cpu":
        assert not any(
            isinstance(module, RoIPool) for module in model.modules()
        ), "CPU inference with RoIPool is not supported currently."

    result_list = []
    for img in imgs:
        # Loaded arrays go in under "img", file paths under "img_path".
        # TODO: remove img_id.
        if isinstance(img, np.ndarray):
            sample = {"img": img, "img_id": 0}
        else:
            sample = {"img_path": img, "img_id": 0}

        if text_prompt:
            sample["text"] = text_prompt
            sample["custom_entities"] = custom_entities

        sample = test_pipeline(sample)

        # test_step is given lists, so wrap the single sample.
        sample["inputs"] = [sample["inputs"]]
        sample["data_samples"] = [sample["data_samples"]]

        with torch.no_grad(), autocast(enabled=fp16):
            result_list.append(model.test_step(sample)[0])

    return result_list if is_batch else result_list[0]
197
+
198
+
199
def inference_masa(
    model: nn.Module,
    img: np.ndarray,
    frame_id: int,
    video_len: int,
    test_pipeline: Optional[Compose] = None,
    text_prompt=None,
    custom_entities: bool = False,
    det_bboxes=None,
    det_labels=None,
    fp16=False,
    detector_type="mmdet",
    show_fps=False,
) -> Union[SampleList, tuple]:
    """Run the masa model on a single video frame.

    Args:
        model (nn.Module): The loaded mot model.
        img (np.ndarray): Loaded image; it is cast to float32 before the
            pipeline is applied.
        frame_id (int): frame id.
        video_len (int): demo video length
        test_pipeline (:obj:`Compose`): Pipeline applied to the frame data.
            Although the default is None, a pipeline is required.
        text_prompt: Optional text prompt. Stored under ``"text"`` for the
            ``"mmdet"`` detector type and under ``"texts"`` for
            ``"yolo-world"``; ignored for any other detector type.
        custom_entities (bool): Stored next to the text prompt.
        det_bboxes: Optional detections attached to the first video data
            sample before the forward pass.
        det_labels: Labels attached together with ``det_bboxes``.
        fp16 (bool): Whether to enable autocast for the forward pass.
        detector_type (str): ``"mmdet"`` or ``"yolo-world"``.
        show_fps (bool): Whether to also return the measured speed.

    Returns:
        The first element of ``model.test_step``'s output. When ``show_fps``
        is True, a ``(result, fps)`` tuple is returned instead.

    Raises:
        TypeError: If ``test_pipeline`` is None.
    """
    if test_pipeline is None:
        # Fail early with an actionable message instead of the opaque
        # "'NoneType' object is not callable" raised further down.
        raise TypeError(
            "test_pipeline must be provided (e.g. built with "
            "build_test_pipeline), but got None"
        )

    data = dict(
        img=[img.astype(np.float32)],
        frame_id=[frame_id],
        ori_shape=[img.shape[:2]],
        img_id=[frame_id + 1],
        ori_video_length=[video_len],
    )

    if text_prompt is not None:
        # The two detector families read the prompt from different keys.
        text_key = {"mmdet": "text", "yolo-world": "texts"}.get(detector_type)
        if text_key is not None:
            data[text_key] = [text_prompt]
            data["custom_entities"] = [custom_entities]

    data = test_pipeline(data)

    # forward the model
    with torch.no_grad():
        data = default_collate([data])
        if det_bboxes is not None:
            video_sample = data["data_samples"][0].video_data_samples[0]
            video_sample.det_bboxes = det_bboxes
            video_sample.det_labels = det_labels

        # perf_counter is monotonic and high resolution, unlike time.time().
        # NOTE(review): no device synchronization is done here, so on GPU the
        # figure may not include all queued work - confirm if precision matters.
        start = time.perf_counter()
        with autocast(enabled=fp16):
            result = model.test_step(data)[0]
        elapsed = time.perf_counter() - start

    if show_fps:
        # Guard against a zero interval instead of raising ZeroDivisionError.
        fps = 1 / elapsed if elapsed > 0 else float("inf")
        return result, fps
    return result
261
+
262
+
263
def build_test_pipeline(
    cfg: ConfigType, with_text=False, detector_type="mmdet"
) -> ConfigType:
    """Build the test pipeline used for mot/vis demo inference.

    The pipeline is made of two steps: the first entry of
    ``cfg.inference_pipeline`` reduced to its resize transform(s), followed
    by a packing step.

    Args:
        cfg (ConfigDict): The loaded config.
        with_text (bool): For non ``"yolo-world"`` detectors, replace the
            packing step's ``meta_keys`` with ``("text", "custom_entities")``.
            Defaults to False.
        detector_type (str): ``"yolo-world"`` keeps the mmyolo resize
            transforms and takes the packing step from the test dataloader
            pipeline. Any other value keeps the last transform whose type
            contains ``"Resize"`` and takes the packing step from
            ``cfg.inference_pipeline``. Defaults to ``"mmdet"``.

    Returns:
        The composed pipeline. Note that this is a ``Compose`` object, even
        though the annotation says ``ConfigType``.
    """
    # Shallow copies, so re-assigning keys below leaves ``cfg`` untouched.
    broadcaster = cfg.inference_pipeline[0].copy()

    if detector_type == "yolo-world":
        yolo_resize_types = ("mmyolo.YOLOv5KeepRatioResize", "mmyolo.LetterResize")
        broadcaster["transforms"] = [
            step
            for step in broadcaster["transforms"]
            if step["type"] in yolo_resize_types
        ]
        packer = cfg.test_dataloader.dataset.pipeline[-1].copy()
    else:
        resize_steps = [
            step for step in broadcaster["transforms"] if "Resize" in step["type"]
        ]
        if resize_steps:
            # Only the last matching transform is kept, as a single entry.
            broadcaster["transforms"] = resize_steps[-1]
        packer = cfg.inference_pipeline[-1].copy()
        if with_text:
            packer["meta_keys"] = ("text", "custom_entities")

    return Compose([broadcaster, packer])
masa/datasets/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Tencent Inc. All rights reserved.
2
+ from .bdd_masa_dataset import BDDVideoDataset
3
+ from .dataset_wrappers import SeqMultiImageMixDataset
4
+ from .evaluation import * # NOQA
5
+ from .masa_dataset import MASADataset
6
+ from .pipelines import * # NOQA
7
+ from .rsconcat_dataset import RandomSampleConcatDataset
8
+ from .tao_masa_dataset import Taov1Dataset, Taov05Dataset
9
+ from .utils import yolow_collate
10
+
11
+ __all__ = [
12
+ "yolow_collate",
13
+ "RandomSampleConcatDataset",
14
+ "MASADataset",
15
+ "SeqMultiImageMixDataset",
16
+ "Taov05Dataset",
17
+ "Taov1Dataset",
18
+ "BDDVideoDataset",
19
+ ]
masa/datasets/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (771 Bytes). View file
 
masa/datasets/__pycache__/bdd_masa_dataset.cpython-311.pyc ADDED
Binary file (4.74 kB). View file
 
masa/datasets/__pycache__/dataset_wrappers.cpython-311.pyc ADDED
Binary file (19.5 kB). View file
 
masa/datasets/__pycache__/masa_dataset.cpython-311.pyc ADDED
Binary file (12.3 kB). View file
 
masa/datasets/__pycache__/rsconcat_dataset.cpython-311.pyc ADDED
Binary file (11.1 kB). View file