diff --git a/OpenPSG/checkpoints/epoch_60.pth b/OpenPSG/checkpoints/epoch_60.pth new file mode 100644 index 0000000000000000000000000000000000000000..9ca2d6cd2ca7532a08cd84df438dbdcece5049c5 --- /dev/null +++ b/OpenPSG/checkpoints/epoch_60.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c4ddcbda74686568b7e6b8145f7f33030407e27e390c37c23206f95c51829ed +size 531751994 diff --git a/OpenPSG/configs/_base_/custom_runtime.py b/OpenPSG/configs/_base_/custom_runtime.py new file mode 100644 index 0000000000000000000000000000000000000000..6c0898bafac0870b691dcfc1467a618973646e7f --- /dev/null +++ b/OpenPSG/configs/_base_/custom_runtime.py @@ -0,0 +1,17 @@ +checkpoint_config = dict(interval=1, max_keep_ckpts=1) +# yapf:disable +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +custom_hooks = [dict(type='NumClassCheckHook')] + +dist_params = dict(backend='nccl') +log_level = 'INFO' +load_from = None +resume_from = None + +workflow = [('train', 1), ('val', 1)] diff --git a/OpenPSG/configs/_base_/datasets/psg.py b/OpenPSG/configs/_base_/datasets/psg.py new file mode 100644 index 0000000000000000000000000000000000000000..052dcd787578900f875b7f9d43729a188a4d2aca --- /dev/null +++ b/OpenPSG/configs/_base_/datasets/psg.py @@ -0,0 +1,93 @@ +# dataset settings +dataset_type = 'PanopticSceneGraphDataset' +ann_file = './data/psg/psg.json' +coco_root = 'data/coco' + +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadPanopticSceneGraphAnnotations', + with_bbox=True, + with_rel=True, + with_mask=True, + with_seg=True, + ), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='SegRescale', scale_factor=1 / 4), + 
dict(type='SceneGraphFormatBundle'), + dict( + type='Collect', + keys=[ + 'img', + 'gt_bboxes', + 'gt_labels', + 'gt_rels', + 'gt_relmaps', + 'gt_masks', + 'gt_semantic_seg', + ], + ), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + # Since the forward process may need gt info, annos must be loaded. + dict(type='LoadPanopticSceneGraphAnnotations', + with_bbox=True, + with_rel=True), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + # NOTE: Do not change the img to DC. + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']), + dict( + type='ToDataContainer', + fields=(dict(key='gt_bboxes'), dict(key='gt_labels')), + ), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), + ], + ), +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file, + img_prefix=coco_root, + seg_prefix=coco_root, + pipeline=train_pipeline, + split='train', + all_bboxes=True, + ), + val=dict( + type=dataset_type, + ann_file=ann_file, + img_prefix=coco_root, + seg_prefix=coco_root, + pipeline=test_pipeline, + split='test', + all_bboxes=True, + ), + test=dict( + type=dataset_type, + ann_file=ann_file, + img_prefix=coco_root, + seg_prefix=coco_root, + pipeline=test_pipeline, + split='test', + all_bboxes=True, + ), +) diff --git a/OpenPSG/configs/_base_/datasets/psg_panoptic.py b/OpenPSG/configs/_base_/datasets/psg_panoptic.py new file mode 100644 index 0000000000000000000000000000000000000000..9e5ee5f27af854da81cc9b936a47d3ed7721502f --- /dev/null +++ b/OpenPSG/configs/_base_/datasets/psg_panoptic.py @@ -0,0 +1,72 @@ +# dataset settings +dataset_type = 'PanopticSceneGraphDataset' +ann_file = './data/psg/psg.json' +coco_root = './data/coco' + +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], 
+ std=[58.395, 57.12, 57.375], + to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='LoadPanopticSceneGraphAnnotations', + with_bbox=True, + with_mask=True, + with_seg=True, + ), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='SegRescale', scale_factor=1 / 4), + dict(type='DefaultFormatBundle'), + dict( + type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg'], + ), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ], + ), +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=ann_file, + img_prefix=coco_root, + seg_prefix=coco_root, + pipeline=train_pipeline, + split='train', + ), + val=dict( + type=dataset_type, + ann_file=ann_file, + img_prefix=coco_root, + seg_prefix=coco_root, + pipeline=test_pipeline, + split='test', + ), + test=dict( + type=dataset_type, + ann_file=ann_file, + img_prefix=coco_root, + seg_prefix=coco_root, + pipeline=test_pipeline, + split='test', + ), +) +evaluation = dict(interval=1, metric='PQ') diff --git a/OpenPSG/configs/_base_/datasets/vg_detection.py b/OpenPSG/configs/_base_/datasets/vg_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..d826ecca5ea9c9bfbaf08366b5b2a468c908363b --- /dev/null +++ b/OpenPSG/configs/_base_/datasets/vg_detection.py @@ -0,0 +1,56 @@ +# dataset settings +custom_imports = dict(imports=[ + 'openpsg.datasets', + 'openpsg.datasets.pipelines', +], + allow_failed_imports=False) + +dataset_type = 
'SceneGraphDataset' +ann_file = 'data/vg/data_openpsg.json' +img_dir = 'data/vg/VG_100K' + +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadSceneGraphAnnotations', with_bbox=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict(samples_per_gpu=2, + workers_per_gpu=2, + train=dict(type=dataset_type, + ann_file=ann_file, + img_prefix=img_dir, + pipeline=train_pipeline, + split='train'), + val=dict(type=dataset_type, + ann_file=ann_file, + img_prefix=img_dir, + pipeline=test_pipeline, + split='test'), + test=dict(type=dataset_type, + ann_file=ann_file, + img_prefix=img_dir, + pipeline=test_pipeline, + split='test')) +evaluation = dict(interval=1, metric='bbox') diff --git a/OpenPSG/configs/_base_/datasets/vg_sg.py b/OpenPSG/configs/_base_/datasets/vg_sg.py new file mode 100644 index 0000000000000000000000000000000000000000..5f555ac70bc04c85cbeb9099fd792114ee2ed9a9 --- /dev/null +++ b/OpenPSG/configs/_base_/datasets/vg_sg.py @@ -0,0 +1,57 @@ +# dataset settings +dataset_type = 'SceneGraphDataset' +ann_file = '/mnt/ssd/gzj/data/VisualGenome/data_openpsg.json' +img_dir = '/mnt/ssd/gzj/data/VisualGenome/VG_100K' + +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True) +train_pipeline = [ + 
dict(type='LoadImageFromFile'), + dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='SceneGraphFormatBundle'), + dict(type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_relmaps']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + # Since the forward process may need gt info, annos must be loaded. + dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + # NOTE: Do not change the img to DC. + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']), + dict(type='ToDataContainer', + fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), + ]) +] +data = dict(samples_per_gpu=2, + workers_per_gpu=2, + train=dict(type=dataset_type, + ann_file=ann_file, + img_prefix=img_dir, + pipeline=train_pipeline, + split='train'), + val=dict(type=dataset_type, + ann_file=ann_file, + img_prefix=img_dir, + pipeline=test_pipeline, + split='test'), + test=dict(type=dataset_type, + ann_file=ann_file, + img_prefix=img_dir, + pipeline=test_pipeline, + split='test')) diff --git a/OpenPSG/configs/_base_/models/detr4seg_r101.py b/OpenPSG/configs/_base_/models/detr4seg_r101.py new file mode 100644 index 0000000000000000000000000000000000000000..7c366f686fe6b2467ec29613cb9f95a229d038cc --- /dev/null +++ b/OpenPSG/configs/_base_/models/detr4seg_r101.py @@ -0,0 +1,64 @@ +model = dict( + type='DETR4seg', + backbone=dict(type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + 
frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101')), + bbox_head=dict(type='detr4segHead', + num_classes=80, + in_channels=2048, + transformer=dict( + type='Transformer', + encoder=dict(type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict(type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1) + ], + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict(type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1), + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', + 'cross_attn', 'norm', 'ffn', + 'norm')), + )), + positional_encoding=dict(type='SinePositionalEncoding', + num_feats=128, + normalize=True), + loss_cls=dict(type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + dice_loss=dict(type='DiceLoss', loss_weight=1.0)), + # training and testing settings + train_cfg=dict(assigner=dict( + type='HungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))), + test_cfg=dict(max_per_img=100)) diff --git a/OpenPSG/configs/_base_/models/detr4seg_r101_psg.py b/OpenPSG/configs/_base_/models/detr4seg_r101_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..1d21e75bc4fd8b693daeaa488a613feb052914fe --- /dev/null +++ b/OpenPSG/configs/_base_/models/detr4seg_r101_psg.py @@ -0,0 +1,137 @@ 
# DETR4seg (ResNet-101 backbone) on the PSG dataset: head of the config.
#
# This file lives in ``configs/_base_/models/``. Entries in ``_base_`` are
# resolved relative to the directory of the config file that lists them, so
# the bases are addressed from here (``./`` for the sibling model config,
# ``../`` for ``configs/_base_``). The previous ``../_base_/...`` entries
# pointed at ``configs/_base_/_base_/...``, which does not exist; the paths
# below follow the same layout the sibling ``detr4seg_r50_psg.py`` uses.
_base_ = [
    './detr4seg_r101.py', '../datasets/psg.py', '../custom_runtime.py'
]

# Modules imported when the config is loaded. ``allow_failed_imports=False``
# turns a missing module into a hard error instead of a silent skip.
# NOTE(review): presumably these imports register the custom model, head,
# pipeline and loss types named in this config -- confirm against openpsg.
custom_imports = dict(
    imports=[
        'openpsg.models.frameworks.detr4seg',
        'openpsg.models.relation_heads.detr4seg_head',
        'openpsg.datasets',
        'openpsg.datasets.pipelines.loading',
        'openpsg.datasets.pipelines.rel_randomcrop',
        'openpsg.models.relation_heads.approaches.matcher',
        'openpsg.models.losses.seg_losses',
    ],
    allow_failed_imports=False,
)

# 133 category names, handed to the bbox head below.
# NOTE(review): the order presumably has to match the label indices produced
# by the dataset -- do not reorder without confirming.
object_classes = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
    'food-other-merged', 'building-other-merged', 'rock-merged',
    'wall-other-merged', 'rug-merged'
]

# Override only the class-dependent fields of the inherited bbox head.
# ``num_classes`` is derived from the list so the two cannot drift apart.
model = dict(bbox_head=dict(
    num_classes=len(object_classes),
    object_classes=object_classes,
))

# Per-channel mean/std handed to the ``Normalize`` transform below.
img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
                    std=[58.395, 57.12, 57.375],
                    to_rgb=True)
# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
# from the default setting in mmdet.
# NOTE(review): the sibling detr4seg_r50_psg.py loads
# ``LoadPanopticSceneGraphAnnotations`` (bbox + mask + seg) and also collects
# ``gt_masks``; this pipeline loads ``LoadSceneGraphAnnotations`` and collects
# no masks. Left unchanged here -- confirm that this is intended.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[
            # Policy 1: a single multi-scale resize.
            [
                dict(type='Resize',
                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                (576, 1333), (608, 1333), (640, 1333),
                                (672, 1333), (704, 1333), (736, 1333),
                                (768, 1333), (800, 1333)],
                     multiscale_mode='value',
                     keep_ratio=True)
            ],
            # Policy 2: coarse resize -> random crop -> multi-scale resize.
            [
                dict(type='Resize',
                     img_scale=[(400, 1333), (500, 1333), (600, 1333)],
                     multiscale_mode='value',
                     keep_ratio=True),
                dict(type='RandomCrop',
                     crop_type='absolute_range',
                     crop_size=(384, 600),
                     allow_negative_crop=False),  # no empty relations
                dict(type='Resize',
                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
                                (576, 1333), (608, 1333), (640, 1333),
                                (672, 1333), (704, 1333), (736, 1333),
                                (768, 1333), (800, 1333)],
                     multiscale_mode='value',
                     override=True,
                     keep_ratio=True)
            ]
        ]),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=1),
    dict(type='RelsFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# test_pipeline, NOTE the Pad's size_divisor is different from the default
# setting (size_divisor=32). While there is little effect on the performance
# whether we use the default setting or use size_divisor=1.
+test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict(samples_per_gpu=2, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) +optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) + +# learning policy +lr_config = dict(policy='step', step=110) +runner = dict(type='EpochBasedRunner', max_epochs=150) + +project_name = 'detr4seg' +expt_name = 'detr4seg_r101_coco' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook')], +) + +load_from = '/mnt/ssd/gzj/test/OpenPSG/detr_r50_fb_origin.pth' diff --git a/OpenPSG/configs/_base_/models/detr4seg_r50.py b/OpenPSG/configs/_base_/models/detr4seg_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..326bc62336154ca94211a820406fb26025a9c544 --- /dev/null +++ b/OpenPSG/configs/_base_/models/detr4seg_r50.py @@ -0,0 +1,65 @@ +model = dict( + type='DETR4seg', + backbone=dict(type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet50')), + bbox_head=dict(type='detr4segHead', + num_classes=80, + in_channels=2048, + transformer=dict( + type='Transformer', + encoder=dict(type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + 
type='BaseTransformerLayer', + attn_cfgs=[ + dict(type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1) + ], + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict(type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1), + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', + 'cross_attn', 'norm', 'ffn', + 'norm')), + )), + positional_encoding=dict(type='SinePositionalEncoding', + num_feats=128, + normalize=True), + loss_cls=dict(type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0), + dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0)), + # training and testing settings + train_cfg=dict(assigner=dict( + type='HungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))), + test_cfg=dict(max_per_img=100)) diff --git a/OpenPSG/configs/_base_/models/detr4seg_r50_psg.py b/OpenPSG/configs/_base_/models/detr4seg_r50_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..07324d4942419d7879ce771a19cc8215a45fd5d2 --- /dev/null +++ b/OpenPSG/configs/_base_/models/detr4seg_r50_psg.py @@ -0,0 +1,152 @@ +_base_ = ['./detr4seg_r50.py', '../datasets/psg.py', '../custom_runtime.py'] + +custom_imports = dict(imports=[ + 'openpsg.models.frameworks.detr4seg', + 'openpsg.models.relation_heads.detr4seg_head', 'openpsg.datasets', + 'openpsg.datasets.pipelines.loading', + 'openpsg.datasets.pipelines.rel_randomcrop', + 
'openpsg.models.relation_heads.approaches.matcher', + 'openpsg.models.losses.seg_losses' +], + allow_failed_imports=False) + +object_classes = [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', + 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', + 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', + 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', + 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', + 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', + 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard', + 'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit', + 'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform', + 'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea', + 'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', + 'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other', + 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged', + 'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged', + 'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged', + 'food-other-merged', 'building-other-merged', 'rock-merged', + 'wall-other-merged', 'rug-merged' +] + +model = dict(bbox_head=dict( + num_classes=len(object_classes), + object_classes=object_classes, +)) + +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True) +# train_pipeline, NOTE the 
img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadPanopticSceneGraphAnnotations', + with_bbox=True, + with_mask=True, + with_seg=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='AutoAugment', + policies=[ + [ + dict(type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + keep_ratio=True) + ], + [ + dict(type='Resize', + img_scale=[(400, 1333), (500, 1333), (600, 1333)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=False), # no empty relations + dict(type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + override=True, + keep_ratio=True) + ] + ]), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='RelsFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']) +] +# test_pipeline, NOTE the Pad's size_divisor is different from the default +# setting (size_divisor=32). While there is little effect on the performance +# whether we use the default setting or use size_divisor=1. 
+test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']) + ]) +] +data = dict(samples_per_gpu=1, + workers_per_gpu=1, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict(type='AdamW', + lr=0.00001, + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'bbox_attention': dict(lr_mult=10.0, decay_mult=1.0), + 'mask_head': dict(lr_mult=10.0, decay_mult=1.0) + })) +optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) + +# learning policy +lr_config = dict(policy='step', step=8) +runner = dict(type='EpochBasedRunner', max_epochs=10) + +evaluation = dict(interval=1, metric='PQ') +checkpoint_config = dict(interval=1, max_keep_ckpts=10) + +project_name = 'detr4seg' +expt_name = 'test_detr4seg_r50_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook'), + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + # config=work_dir + "/cfg.yaml" + )) + ], +) + +load_from = 'detr_pan_r50.pth' diff --git a/OpenPSG/configs/_base_/models/detr_r50.py b/OpenPSG/configs/_base_/models/detr_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..b83d7d5e108ff52eb9c2c8701697684e1fd88844 --- /dev/null +++ b/OpenPSG/configs/_base_/models/detr_r50.py @@ -0,0 +1,64 @@ +model = dict( + type='DETR', + backbone=dict(type='ResNet', + depth=50, + num_stages=4, + out_indices=(3, ), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + 
style='pytorch', + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet50')), + bbox_head=dict(type='DETRHead', + num_classes=80, + in_channels=2048, + transformer=dict( + type='Transformer', + encoder=dict(type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict(type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1) + ], + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict(type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1), + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', + 'cross_attn', 'norm', 'ffn', + 'norm')), + )), + positional_encoding=dict(type='SinePositionalEncoding', + num_feats=128, + normalize=True), + loss_cls=dict(type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0)), + # training and testing settings + train_cfg=dict(assigner=dict( + type='HungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))), + test_cfg=dict(max_per_img=100)) diff --git a/OpenPSG/configs/_base_/models/mask_rcnn_r50_fpn.py b/OpenPSG/configs/_base_/models/mask_rcnn_r50_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..19b9c3d29c7af8ac828c25a1b388248aa23a2d77 --- /dev/null +++ b/OpenPSG/configs/_base_/models/mask_rcnn_r50_fpn.py @@ -0,0 +1,107 @@ +# model settings +model = dict( + type='MaskRCNN', + backbone=dict(type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 
1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet50')), + neck=dict(type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict(type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict(type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict(type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict(type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict(type='StandardRoIHead', + bbox_roi_extractor=dict(type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', + output_size=7, + sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict(type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict(type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + mask_roi_extractor=dict(type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', + output_size=14, + sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict(type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict(type='CrossEntropyLoss', + use_mask=True, + loss_weight=1.0))), + # model training and testing settings + train_cfg=dict(rpn=dict(assigner=dict(type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict(type='RandomSampler', + num=256, + pos_fraction=0.5, 
+ neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict(nms_pre=2000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict(assigner=dict(type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict(type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)), + test_cfg=dict(rpn=dict(nms_pre=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + rcnn=dict(score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/OpenPSG/configs/_base_/models/panoptic_fpn_r101_fpn_psg.py b/OpenPSG/configs/_base_/models/panoptic_fpn_r101_fpn_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..449ec6c9ff81c8447bc74029fad68d1bb3dc9598 --- /dev/null +++ b/OpenPSG/configs/_base_/models/panoptic_fpn_r101_fpn_psg.py @@ -0,0 +1,8 @@ +_base_ = './panoptic_fpn_r50_fpn_psg.py' + +model = dict(backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'))) + +expt_name = 'panoptic_fpn_r101_fpn_psg' +load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth' diff --git a/OpenPSG/configs/_base_/models/panoptic_fpn_r50_fpn_psg.py b/OpenPSG/configs/_base_/models/panoptic_fpn_r50_fpn_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..44a01a4ea386ddb8c4264a6454da4d70ffde63fc --- /dev/null +++ b/OpenPSG/configs/_base_/models/panoptic_fpn_r50_fpn_psg.py @@ -0,0 +1,74 @@ +_base_ = [ + '../models/mask_rcnn_r50_fpn.py', + '../datasets/psg_panoptic.py', + '../schedules/schedule_1x.py', + '../custom_runtime.py', +] + +model = dict( + type='PanopticFPN', + semantic_head=dict( + type='PanopticFPNHead', + 
num_things_classes=80, + num_stuff_classes=53, + in_channels=256, + inner_channels=128, + start_level=0, + end_level=4, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + conv_cfg=None, + loss_seg=dict(type='CrossEntropyLoss', + ignore_index=255, + loss_weight=0.5), + ), + panoptic_fusion_head=dict(type='HeuristicFusionHead', + num_things_classes=80, + num_stuff_classes=53), + test_cfg=dict(panoptic=dict( + score_thr=0.6, + max_per_img=100, + mask_thr_binary=0.5, + mask_overlap=0.5, + nms=dict(type='nms', iou_threshold=0.5, class_agnostic=True), + stuff_area_limit=4096, + )), +) + +custom_hooks = [] + +# Change batch size and learning rate +data = dict(samples_per_gpu=8, + # workers_per_gpu=2 + ) +# optimizer = dict(lr=0.02) +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(_delete_=True, + grad_clip=dict(max_norm=35, norm_type=2)) + +lr_config = dict(policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[8, 11]) + +project_name = 'openpsg' +expt_name = 'panoptic_fpn_r50_fpn_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + # config=work_dir + "/cfg.yaml" + ), + ), + ], +) + +load_from = 'work_dirs/checkpoints/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth' diff --git a/OpenPSG/configs/_base_/models/psgtr_r101.py b/OpenPSG/configs/_base_/models/psgtr_r101.py new file mode 100644 index 0000000000000000000000000000000000000000..28a043e12a54656ed52202a348058bd0dc3d6f9d --- /dev/null +++ b/OpenPSG/configs/_base_/models/psgtr_r101.py @@ -0,0 +1,5 @@ +_base_ = './psgtr_r50.py' + +model = dict(backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'))) diff --git a/OpenPSG/configs/_base_/models/psgtr_r50.py 
b/OpenPSG/configs/_base_/models/psgtr_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..96eccd68df077c5de98613fe62d4bcacb5b7f5a4 --- /dev/null +++ b/OpenPSG/configs/_base_/models/psgtr_r50.py @@ -0,0 +1,82 @@ +model = dict( + type='PSGTr', + backbone=dict(type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet50')), + bbox_head=dict(type='PSGTrHead', + num_classes=80, + num_relations=117, + in_channels=2048, + transformer=dict( + type='Transformer', + encoder=dict(type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict(type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1) + ], + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict(type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1), + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', + 'cross_attn', 'norm', 'ffn', + 'norm')), + )), + positional_encoding=dict(type='SinePositionalEncoding', + num_feats=128, + normalize=True), + sub_loss_cls=dict(type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + sub_loss_bbox=dict(type='L1Loss', loss_weight=5.0), + sub_loss_iou=dict(type='GIoULoss', loss_weight=2.0), + sub_focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0), + sub_dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0), + obj_loss_cls=dict(type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + obj_loss_bbox=dict(type='L1Loss', loss_weight=5.0), + 
obj_loss_iou=dict(type='GIoULoss', loss_weight=2.0), + obj_focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0), + obj_dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0), + rel_loss_cls=dict(type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + class_weight=1.0)), + # training and testing settings + train_cfg=dict(assigner=dict( + type='HTriMatcher', + s_cls_cost=dict(type='ClassificationCost', weight=1.), + s_reg_cost=dict(type='BBoxL1Cost', weight=5.0), + s_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + o_cls_cost=dict(type='ClassificationCost', weight=1.), + o_reg_cost=dict(type='BBoxL1Cost', weight=5.0), + o_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + r_cls_cost=dict(type='ClassificationCost', weight=2.))), + test_cfg=dict(max_per_img=100)) diff --git a/OpenPSG/configs/_base_/schedules/schedule_1x.py b/OpenPSG/configs/_base_/schedules/schedule_1x.py new file mode 100644 index 0000000000000000000000000000000000000000..1c01d3df3d9169fee87ffeaa4e0fb60ac3f07b66 --- /dev/null +++ b/OpenPSG/configs/_base_/schedules/schedule_1x.py @@ -0,0 +1,10 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict(policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[8, 11]) +runner = dict(type='EpochBasedRunner', max_epochs=12) diff --git a/OpenPSG/configs/_base_/schedules/schedule_3x.py b/OpenPSG/configs/_base_/schedules/schedule_3x.py new file mode 100644 index 0000000000000000000000000000000000000000..4109da969702ecb2962606ec3891cedfcd4cd2ae --- /dev/null +++ b/OpenPSG/configs/_base_/schedules/schedule_3x.py @@ -0,0 +1,10 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict(policy='step', + warmup='linear', + warmup_iters=1000, + warmup_ratio=0.001, + step=[27, 33]) +runner = 
dict(type='EpochBasedRunner', max_epochs=36) diff --git a/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_predcls_psg.py b/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_predcls_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..1be5fdcf74eeb3e941ef2829546cfb14338face8 --- /dev/null +++ b/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_predcls_psg.py @@ -0,0 +1,26 @@ +_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py' + +model = dict(backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'))) + +# Log config +project_name = 'openpsg' +expt_name = 'gpsnet_panoptic_fpn_r101_fpn_1x_predcls_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + ), + ), + ], +) + +load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth' diff --git a/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_sgdet_psg.py b/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_sgdet_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..3ae604515cefc1aa3849ee328c1667408f08cab4 --- /dev/null +++ b/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_sgdet_psg.py @@ -0,0 +1,26 @@ +_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py' + +model = dict(backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'))) + +# Log config +project_name = 'openpsg' +expt_name = 'gpsnet_panoptic_fpn_r101_fpn_1x_sgdet_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + ), + ), + ], +) + +load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth' diff --git 
a/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_predcls_psg.py b/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_predcls_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..cd06ebcd9c19aec5210937600af4db0d66d99def --- /dev/null +++ b/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_predcls_psg.py @@ -0,0 +1,41 @@ +_base_ = [ + '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py', +] + +model = dict(relation_head=dict( + type='GPSHead', + head_config=dict( + # NOTE: Evaluation type + use_gt_box=True, + use_gt_label=True, + ), +)) + +evaluation = dict(interval=1, + metric='predcls', + relation_mode=True, + classwise=True, + detection_method='pan_seg') + +# Change batch size and learning rate +data = dict(samples_per_gpu=16, workers_per_gpu=0) +optimizer = dict(type='SGD', lr=0.03, momentum=0.9, weight_decay=0.0001) + +# Log config +project_name = 'openpsg' +expt_name = 'gpsnet_panoptic_fpn_r50_fpn_1x_predcls_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + ), + ), + ], +) diff --git a/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_sgdet_psg.py b/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_sgdet_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..78165a4ce56b57819445d8d58840c6f9fca5f4a8 --- /dev/null +++ b/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_sgdet_psg.py @@ -0,0 +1,45 @@ +_base_ = [ + '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py', +] + +model = dict( + relation_head=dict( + type='GPSHead', + head_config=dict( + # NOTE: Evaluation type + use_gt_box=False, + use_gt_label=False, + ), + ), + roi_head=dict(bbox_head=dict(type='SceneGraphBBoxHead'), ), +) + +evaluation = dict( + interval=1, + metric='sgdet', + relation_mode=True, + classwise=True, + iou_thrs=0.5, + detection_method='pan_seg', +) + +data = dict(samples_per_gpu=16) + +# Log 
config +project_name = 'openpsg' +expt_name = 'gpsnet_panoptic_fpn_r50_fpn_1x_sgdet_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + ), + ), + ], +) diff --git a/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_predcls_psg.py b/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_predcls_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..28bc0487451535069f9301853e0190fc9025bb85 --- /dev/null +++ b/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_predcls_psg.py @@ -0,0 +1,28 @@ +_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py' + +model = dict(backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'))) + +# Log config +project_name = 'openpsg' +expt_name = 'imp_panoptic_fpn_r101_fpn_1x_predcls_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + # config=work_dir + "/cfg.yaml" + ), + ), + ], +) + +load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth' diff --git a/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_sgdet_psg.py b/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_sgdet_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..7f0f96866d423e0f6a214e98462c721626744309 --- /dev/null +++ b/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_sgdet_psg.py @@ -0,0 +1,26 @@ +_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py' + +model = dict(backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'))) + +# Log config +project_name = 'openpsg' +expt_name = 'imp_panoptic_fpn_r101_fpn_1x_sgdet_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + 
interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + ), + ), + ], +) + +load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth' diff --git a/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_predcls_psg.py b/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_predcls_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..93189cfd37a51374fe62e29b0bc8550559da3a27 --- /dev/null +++ b/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_predcls_psg.py @@ -0,0 +1,44 @@ +_base_ = [ + '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py', +] + +model = dict(relation_head=dict( + type='IMPHead', + head_config=dict( + # NOTE: Evaluation type + use_gt_box=True, + use_gt_label=True, + num_iter=2, + ), +)) + +evaluation = dict(interval=1, + metric='predcls', + relation_mode=True, + classwise=True) + +# Change batch size and learning rate +data = dict(samples_per_gpu=16, ) +# workers_per_gpu=0) # FIXME: Is this the problem? 
+optimizer = dict(type='SGD', lr=0.001, momentum=0.9) + +# Log config +project_name = 'openpsg' +expt_name = 'imp_panoptic_fpn_r50_fpn_1x_predcls_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + # config=work_dir + "/cfg.yaml" + ), + ), + ], +) diff --git a/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_sgdet_psg.py b/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_sgdet_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..1ec83492bfccc1b706723b6de680392f9b0e2c7a --- /dev/null +++ b/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_sgdet_psg.py @@ -0,0 +1,48 @@ +_base_ = [ + '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py', +] + +model = dict(relation_head=dict( + type='IMPHead', + head_config=dict( + # NOTE: Evaluation type + use_gt_box=False, + use_gt_label=False, + num_iter=2, + ), +)) + +evaluation = dict( + interval=1, + metric='sgdet', + relation_mode=True, + classwise=True, + iou_thrs=0.5, + detection_method='pan_seg', +) + +# Change batch size and learning rate +data = dict(samples_per_gpu=16, ) +# workers_per_gpu=0) # FIXME: Is this the problem? 
+optimizer = dict(type='SGD', lr=0.001, momentum=0.9) + +# Log config +project_name = 'openpsg' +expt_name = 'imp_panoptic_fpn_r50_fpn_1x_sgdet_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + # config=work_dir + "/cfg.yaml" + ), + ), + ], +) diff --git a/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_predcls_psg.py b/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_predcls_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..d125d475b96e26c7862d16b5335798ee9defab44 --- /dev/null +++ b/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_predcls_psg.py @@ -0,0 +1,28 @@ +_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py' + +model = dict(backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'))) + +# Log config +project_name = 'openpsg' +expt_name = 'motifs_panoptic_fpn_r101_fpn_1x_predcls_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + # config=work_dir + "/cfg.yaml" + ), + ), + ], +) + +load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth' diff --git a/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_sgdet_psg.py b/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_sgdet_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..55b1f9eadee9904706504b57f896e2e6482d6385 --- /dev/null +++ b/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_sgdet_psg.py @@ -0,0 +1,28 @@ +_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py' + +model = dict(backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'))) + +# 
Log config +project_name = 'openpsg' +expt_name = 'motifs_panoptic_fpn_r101_fpn_1x_sgdet_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + # config=work_dir + "/cfg.yaml" + ), + ), + ], +) + +load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth' diff --git a/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py b/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..72e2afc7e139a93749fcb28f8f8a7b4c3612478d --- /dev/null +++ b/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py @@ -0,0 +1,241 @@ +_base_ = [ + '../_base_/models/mask_rcnn_r50_fpn.py', + '../_base_/datasets/psg.py', + '../_base_/schedules/schedule_1x.py', + '../_base_/custom_runtime.py', +] + +find_unused_parameters = True +dataset_type = 'PanopticSceneGraphDataset' + +# HACK: +object_classes = [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', + 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', + 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', + 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', + 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', + 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', + 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 
'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard', + 'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit', + 'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform', + 'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea', + 'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', + 'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other', + 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged', + 'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged', + 'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged', + 'food-other-merged', 'building-other-merged', 'rock-merged', + 'wall-other-merged', 'rug-merged' +] + +predicate_classes = [ + 'over', + 'in front of', + 'beside', + 'on', + 'in', + 'attached to', + 'hanging from', + 'on back of', + 'falling off', + 'going down', + 'painted on', + 'walking on', + 'running on', + 'crossing', + 'standing on', + 'lying on', + 'sitting on', + 'flying over', + 'jumping over', + 'jumping from', + 'wearing', + 'holding', + 'carrying', + 'looking at', + 'guiding', + 'kissing', + 'eating', + 'drinking', + 'feeding', + 'biting', + 'catching', + 'picking', + 'playing with', + 'chasing', + 'climbing', + 'cleaning', + 'playing', + 'touching', + 'pushing', + 'pulling', + 'opening', + 'cooking', + 'talking to', + 'throwing', + 'slicing', + 'driving', + 'riding', + 'parked on', + 'driving on', + 'about to hit', + 'kicking', + 'swinging', + 'entering', + 'exiting', + 'enclosing', + 'leaning on', +] + +model = dict( + type='SceneGraphPanopticFPN', + semantic_head=dict( + type='PanopticFPNHead', + num_things_classes=80, + num_stuff_classes=53, + in_channels=256, + inner_channels=128, + start_level=0, + end_level=4, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + conv_cfg=None, + loss_seg=dict(type='CrossEntropyLoss', + ignore_index=255, + loss_weight=0.5), + ), + 
panoptic_fusion_head=dict(type='HeuristicFusionHead', + num_things_classes=80, + num_stuff_classes=53), + test_cfg=dict(panoptic=dict( + score_thr=0.6, + max_per_img=100, + mask_thr_binary=0.5, + mask_overlap=0.5, + nms=dict(type='nms', iou_threshold=0.5, class_agnostic=True), + stuff_area_limit=4096, + )), + relation_head=dict( + type='MotifHead', + object_classes=object_classes, + predicate_classes=predicate_classes, + num_classes=len(object_classes) + 1, # with background class + num_predicates=len(predicate_classes) + 1, + use_bias=False, # NOTE: whether to use frequency bias + head_config=dict( + # NOTE: Evaluation type + use_gt_box=True, + use_gt_label=True, + use_vision=True, + embed_dim=200, + hidden_dim=512, + roi_dim=1024, + context_pooling_dim=4096, + dropout_rate=0.2, + context_object_layer=1, + context_edge_layer=1, + glove_dir='data/glove/', + causal_effect_analysis=False, + ), + bbox_roi_extractor=dict( + type='VisualSpatialExtractor', + bbox_roi_layer=dict(type='RoIAlign', + output_size=7, + sampling_ratio=2), + with_visual_bbox=True, + with_visual_mask=False, + with_visual_point=False, + with_spatial=False, + in_channels=256, + fc_out_channels=1024, + featmap_strides=[4, 8, 16, 32], + ), + relation_roi_extractor=dict( + type='VisualSpatialExtractor', + bbox_roi_layer=dict(type='RoIAlign', + output_size=7, + sampling_ratio=2), + with_visual_bbox=True, + with_visual_mask=False, + with_visual_point=False, + with_spatial=True, + separate_spatial=False, + in_channels=256, + fc_out_channels=1024, + featmap_strides=[4, 8, 16, 32], + ), + relation_sampler=dict( + type='Motif', + pos_iou_thr=0.5, + require_overlap=False, # overlap is not required for sgdet training + num_sample_per_gt_rel=4, + num_rel_per_image=1024, + pos_fraction=0.25, + # NOTE: To only include overlapping bboxes?
+ test_overlap=False, # for testing + ), + loss_object=dict(type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_relation=dict(type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + ), +) + +custom_hooks = [] + +# To freeze modules +freeze_modules = [ + 'backbone', + 'neck', + 'rpn_head', + 'roi_head', + 'semantic_head', + 'panoptic_fusion_head', +] + +evaluation = dict(interval=1, + metric='predcls', + relation_mode=True, + classwise=True) + +# Change batch size and learning rate +data = dict(samples_per_gpu=16, ) +# optimizer = dict(lr=0.003) +optimizer = dict(type='SGD', lr=0.03, momentum=0.9, weight_decay=0.0001) +optimizer_config = dict(_delete_=True, + grad_clip=dict(max_norm=35, norm_type=2)) + +lr_config = dict(policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + step=[7, 10]) + +# Log config +project_name = 'openpsg' +expt_name = 'motifs_panoptic_fpn_r50_fpn_1x_predcls_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + ), + ), + ], +) + +load_from = 'work_dirs/checkpoints/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth' diff --git a/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_sgdet_psg.py b/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_sgdet_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..55586140a9723c83b0b347bbfde042822ae8618b --- /dev/null +++ b/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_sgdet_psg.py @@ -0,0 +1,44 @@ +_base_ = [ + './panoptic_fpn_r50_fpn_1x_predcls_psg.py', +] + +model = dict( + relation_head=dict( + head_config=dict( + # NOTE: Evaluation type + use_gt_box=False, + use_gt_label=False, + ), ), + roi_head=dict(bbox_head=dict(type='SceneGraphBBoxHead'), ), +) + +evaluation = dict(interval=1, + metric='sgdet', + relation_mode=True, + classwise=True, + iou_thrs=0.5, + 
detection_method='pan_seg') + +# Change batch size and learning rate +data = dict(samples_per_gpu=8, + # workers_per_gpu=2 + ) + +# Log config +project_name = 'openpsg' +expt_name = 'motifs_panoptic_fpn_r50_fpn_1x_sgdet_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + ), + ), + ], +) diff --git a/OpenPSG/configs/psgformer/psgformer_r101_psg.py b/OpenPSG/configs/psgformer/psgformer_r101_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..7055248f2307ca9b32f7efe3c6a65f118019a0c7 --- /dev/null +++ b/OpenPSG/configs/psgformer/psgformer_r101_psg.py @@ -0,0 +1,16 @@ +_base_ = './psgformer_r50_psg.py' + +model = dict(backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'))) + +# learning policy +lr_config = dict(policy='step', step=48) +runner = dict(type='EpochBasedRunner', max_epochs=60) + +project_name = 'psgformer' +expt_name = 'psgformer_r101_psg' +work_dir = f'./work_dirs/{expt_name}' +checkpoint_config = dict(interval=12, max_keep_ckpts=10) + +load_from = './work_dirs/checkpoints/detr4psgformer_r101.pth' diff --git a/OpenPSG/configs/psgformer/psgformer_r50.py b/OpenPSG/configs/psgformer/psgformer_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..31f77e61bf46c57f8b064ca94d6a5d35b8008411 --- /dev/null +++ b/OpenPSG/configs/psgformer/psgformer_r50.py @@ -0,0 +1,96 @@ +model = dict( + type='PSGTr', + backbone=dict(type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet50')), + bbox_head=dict( + type='PSGFormerHead', + num_classes=80, + num_relations=117, + in_channels=2048, + transformer=dict( + type='DualTransformer', + 
encoder=dict(type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict(type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1) + ], + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'ffn', + 'norm'))), + decoder1=dict(type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict(type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1), + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', + 'cross_attn', 'norm', 'ffn', + 'norm'))), + decoder2=dict(type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict(type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1), + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', + 'cross_attn', 'norm', 'ffn', + 'norm'))), + ), + positional_encoding=dict(type='SinePositionalEncoding', + num_feats=128, + normalize=True), + rel_loss_cls=dict(type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + class_weight=1.0), + sub_id_loss=dict(type='MultilabelCrossEntropy', loss_weight=2.0), + obj_id_loss=dict(type='MultilabelCrossEntropy', loss_weight=2.0), + loss_cls=dict(type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=4.0, + class_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=3.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0), + dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0)), + # training and testing settings + train_cfg=dict(id_assigner=dict(type='IdMatcher', + sub_id_cost=dict(type='ClassificationCost', + weight=1.), + obj_id_cost=dict(type='ClassificationCost', + weight=1.), + r_cls_cost=dict(type='ClassificationCost', + 
weight=1.)), + bbox_assigner=dict(type='HungarianAssigner', + cls_cost=dict(type='ClassificationCost', + weight=4.0), + reg_cost=dict(type='BBoxL1Cost', + weight=3.0), + iou_cost=dict(type='IoUCost', + iou_mode='giou', + weight=2.0))), + test_cfg=dict(max_per_img=100)) diff --git a/OpenPSG/configs/psgformer/psgformer_r50_psg.py b/OpenPSG/configs/psgformer/psgformer_r50_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..6452d39335427fe40de8c8a869dedeb5992da2f9 --- /dev/null +++ b/OpenPSG/configs/psgformer/psgformer_r50_psg.py @@ -0,0 +1,244 @@ +_base_ = [ + './psgformer_r50.py', '../_base_/datasets/psg.py', + '../_base_/custom_runtime.py' +] + +find_unused_parameters = True + +custom_imports = dict(imports=[ + 'openpsg.models.frameworks.psgtr', 'openpsg.models.losses.seg_losses', + 'openpsg.models.frameworks.dual_transformer', + 'openpsg.models.relation_heads.psgformer_head', 'openpsg.datasets', + 'openpsg.datasets.pipelines.loading', + 'openpsg.datasets.pipelines.rel_randomcrop', + 'openpsg.models.relation_heads.approaches.matcher', 'openpsg.utils' +], + allow_failed_imports=False) + +dataset_type = 'PanopticSceneGraphDataset' + +# HACK: +object_classes = [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', + 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', + 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', + 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', + 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', + 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', + 
'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard', + 'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit', + 'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform', + 'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea', + 'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', + 'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other', + 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged', + 'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged', + 'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged', + 'food-other-merged', 'building-other-merged', 'rock-merged', + 'wall-other-merged', 'rug-merged' +] + +predicate_classes = [ + 'over', + 'in front of', + 'beside', + 'on', + 'in', + 'attached to', + 'hanging from', + 'on back of', + 'falling off', + 'going down', + 'painted on', + 'walking on', + 'running on', + 'crossing', + 'standing on', + 'lying on', + 'sitting on', + 'flying over', + 'jumping over', + 'jumping from', + 'wearing', + 'holding', + 'carrying', + 'looking at', + 'guiding', + 'kissing', + 'eating', + 'drinking', + 'feeding', + 'biting', + 'catching', + 'picking', + 'playing with', + 'chasing', + 'climbing', + 'cleaning', + 'playing', + 'touching', + 'pushing', + 'pulling', + 'opening', + 'cooking', + 'talking to', + 'throwing', + 'slicing', + 'driving', + 'riding', + 'parked on', + 'driving on', + 'about to hit', + 'kicking', + 'swinging', + 'entering', + 'exiting', + 'enclosing', + 'leaning on', +] + +model = dict(bbox_head=dict( + num_classes=len(object_classes), + num_relations=len(predicate_classes), + object_classes=object_classes, + predicate_classes=predicate_classes, + num_obj_query=100, + num_rel_query=100, +), ) + +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True) +# train_pipeline, 
NOTE the img_scale and the Pad's size_divisor are different +# from the default setting in mmdet. +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadPanopticSceneGraphAnnotations', + with_bbox=True, + with_rel=True, + with_mask=True, + with_seg=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='AutoAugment', + policies=[ + [ + dict(type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + keep_ratio=True) + ], + [ + dict(type='Resize', + img_scale=[(400, 1333), (500, 1333), (600, 1333)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RelRandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=False), # no empty relations + dict(type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + override=True, + keep_ratio=True) + ] + ]), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='RelsFormatBundle'), + dict(type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_masks']) +] +# test_pipeline, NOTE the Pad's size_divisor is different from the default +# setting (size_divisor=32), although there is little effect on the performance +# whether we use the default setting or size_divisor=1.
+test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True), + dict(type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='ImageToTensor', keys=['img']), + dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']), + dict(type='ToDataContainer', + fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))), + dict(type='Collect', keys=['img']), + ]) +] + +evaluation = dict( + interval=1, + metric='sgdet', + relation_mode=True, + classwise=True, + iou_thrs=0.5, + detection_method='pan_seg', +) + +data = dict(samples_per_gpu=1, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.001, + weight_decay=0.0001, + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'transformer.encoder': dict(lr_mult=0.1, decay_mult=1.0), + 'transformer.decoder1': dict(lr_mult=0.1, decay_mult=1.0), + 'obj_query_embed': dict(lr_mult=0.1, decay_mult=1.0), + 'input_proj': dict(lr_mult=0.1, decay_mult=1.0), + 'class_embed': dict(lr_mult=0.1, decay_mult=1.0), + 'box_embed': dict(lr_mult=0.1, decay_mult=1.0), + 'bbox_attention': dict(lr_mult=0.1, decay_mult=1.0), + 'mask_head': dict(lr_mult=0.1, decay_mult=1.0), + })) + +optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) + +# learning policy +lr_config = dict(policy='step', step=40) +runner = dict(type='EpochBasedRunner', max_epochs=60) + +project_name = 'psgformer' +expt_name = 'psgformer_r50_psg' +work_dir = f'./work_dirs/{expt_name}' +checkpoint_config = dict(interval=1, max_keep_ckpts=15) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict( + type='WandbLoggerHook', + init_kwargs=dict( + 
project=project_name, + name=expt_name, + ), + ) + ], +) + +load_from = './work_dirs/checkpoints/detr4psgformer_r50.pth' diff --git a/OpenPSG/configs/psgformer/psgformer_r50_psg_inference.py b/OpenPSG/configs/psgformer/psgformer_r50_psg_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..37bebaf42627dc17503986567b18fc6a9770f427 --- /dev/null +++ b/OpenPSG/configs/psgformer/psgformer_r50_psg_inference.py @@ -0,0 +1,31 @@ +_base_ = [ + './psgformer_r50_psg.py' +] + +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True) +pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + # NOTE: Do not change the img to DC. + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + + ], + ), +] + +data = dict( + test=dict( + pipeline=pipeline, + ), +) \ No newline at end of file diff --git a/OpenPSG/configs/psgtr/psgtr_r101_psg.py b/OpenPSG/configs/psgtr/psgtr_r101_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..916dc05998c72b83fb5c3221be10af3f5a7f7827 --- /dev/null +++ b/OpenPSG/configs/psgtr/psgtr_r101_psg.py @@ -0,0 +1,231 @@ +_base_ = [ + '../_base_/models/psgtr_r101.py', '../_base_/datasets/psg.py', + '../_base_/custom_runtime.py' +] + +custom_imports = dict(imports=[ + 'openpsg.models.frameworks.psgtr', 'openpsg.models.losses.seg_losses', + 'openpsg.models.relation_heads.psgtr_head', 'openpsg.datasets', + 'openpsg.datasets.pipelines.loading', + 'openpsg.datasets.pipelines.rel_randomcrop', + 'openpsg.models.relation_heads.approaches.matcher', 'openpsg.utils' +], + allow_failed_imports=False) + +dataset_type = 'PanopticSceneGraphDataset' + +# HACK: +object_classes = [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 
'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', + 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', + 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', + 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', + 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', + 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', + 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard', + 'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit', + 'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform', + 'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea', + 'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', + 'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other', + 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged', + 'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged', + 'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged', + 'food-other-merged', 'building-other-merged', 'rock-merged', + 'wall-other-merged', 'rug-merged' +] + +predicate_classes = [ + 'over', + 'in front of', + 'beside', + 'on', + 'in', + 'attached to', + 'hanging from', + 'on back of', + 'falling off', + 'going down', + 'painted on', + 'walking on', + 'running on', + 'crossing', + 'standing on', + 'lying on', + 'sitting on', + 'flying over', + 'jumping over', + 'jumping from', + 'wearing', + 'holding', + 'carrying', + 'looking at', + 'guiding', + 'kissing', + 'eating', + 'drinking', + 
'feeding', + 'biting', + 'catching', + 'picking', + 'playing with', + 'chasing', + 'climbing', + 'cleaning', + 'playing', + 'touching', + 'pushing', + 'pulling', + 'opening', + 'cooking', + 'talking to', + 'throwing', + 'slicing', + 'driving', + 'riding', + 'parked on', + 'driving on', + 'about to hit', + 'kicking', + 'swinging', + 'entering', + 'exiting', + 'enclosing', + 'leaning on', +] + +model = dict(bbox_head=dict( + num_classes=len(object_classes), + num_relations=len(predicate_classes), + object_classes=object_classes, + predicate_classes=predicate_classes, + use_mask=True, + num_query=100, +), ) + +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True) +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadPanopticSceneGraphAnnotations', + with_bbox=True, + with_rel=True, + with_mask=True, + with_seg=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='AutoAugment', + policies=[ + [ + dict(type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + keep_ratio=True) + ], + [ + dict(type='Resize', + img_scale=[(400, 1333), (500, 1333), (600, 1333)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RelRandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=False), # no empty relations + dict(type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + override=True, + keep_ratio=True) + ] + ]), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='RelsFormatBundle'), + dict(type='Collect', + keys=['img', 
'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_masks']) +] +# test_pipeline, NOTE the Pad's size_divisor is different from the default +# setting (size_divisor=32). While there is little effect on the performance +# whether we use the default setting or use size_divisor=1. +test_pipeline = [ + dict(type='LoadImageFromFile'), + # dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='ImageToTensor', keys=['img']), + # dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']), + # dict(type='ToDataContainer', fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))), + dict(type='Collect', keys=['img']), + ]) +] + +evaluation = dict( + interval=1, + metric='sgdet', + relation_mode=True, + classwise=True, + iou_thrs=0.5, + detection_method='pan_seg', +) + +data = dict(samples_per_gpu=1, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.0001, + paramwise_cfg=dict(custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + })) +optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) + +# learning policy +lr_config = dict(policy='step', step=40) +runner = dict(type='EpochBasedRunner', max_epochs=60) + +project_name = 'psgtr' +expt_name = 'psgtr_r101_psg' +work_dir = f'./work_dirs/{expt_name}' +checkpoint_config = dict(interval=2, max_keep_ckpts=10) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + ), + ) + ], +) + +load_from = 'work_dirs/checkpoints/detr_pan_r101.pth' diff --git a/OpenPSG/configs/psgtr/psgtr_r50.py 
b/OpenPSG/configs/psgtr/psgtr_r50.py new file mode 100644 index 0000000000000000000000000000000000000000..c8827bbb9461a34a9d894c2aee9fb6286503898d --- /dev/null +++ b/OpenPSG/configs/psgtr/psgtr_r50.py @@ -0,0 +1,82 @@ +model = dict( + type='PSGTr', + backbone=dict(type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch', + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet50')), + bbox_head=dict(type='PSGTrHead', + num_classes=80, + num_relations=117, + in_channels=2048, + transformer=dict( + type='Transformer', + encoder=dict(type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict(type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1) + ], + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict(type='MultiheadAttention', + embed_dims=256, + num_heads=8, + dropout=0.1), + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', + 'cross_attn', 'norm', 'ffn', + 'norm')), + )), + positional_encoding=dict(type='SinePositionalEncoding', + num_feats=128, + normalize=True), + sub_loss_cls=dict(type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + sub_loss_bbox=dict(type='L1Loss', loss_weight=5.0), + sub_loss_iou=dict(type='GIoULoss', loss_weight=2.0), + sub_focal_loss=dict(type='BCEFocalLoss', loss_weight=2.0), + sub_dice_loss=dict(type='psgtrDiceLoss', loss_weight=2.0), + obj_loss_cls=dict(type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + obj_loss_bbox=dict(type='L1Loss', loss_weight=5.0), + obj_loss_iou=dict(type='GIoULoss', 
loss_weight=2.0), + obj_focal_loss=dict(type='BCEFocalLoss', loss_weight=2.0), + obj_dice_loss=dict(type='psgtrDiceLoss', loss_weight=2.0), + rel_loss_cls=dict(type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + class_weight=1.0)), + # training and testing settings + train_cfg=dict(assigner=dict( + type='HTriMatcher', + s_cls_cost=dict(type='ClassificationCost', weight=1.), + s_reg_cost=dict(type='BBoxL1Cost', weight=5.0), + s_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + o_cls_cost=dict(type='ClassificationCost', weight=1.), + o_reg_cost=dict(type='BBoxL1Cost', weight=5.0), + o_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + r_cls_cost=dict(type='ClassificationCost', weight=2.))), + test_cfg=dict(max_per_img=100)) diff --git a/OpenPSG/configs/psgtr/psgtr_r50_psg.py b/OpenPSG/configs/psgtr/psgtr_r50_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..6440149836d4eadd912b5c00412e247ee4637e68 --- /dev/null +++ b/OpenPSG/configs/psgtr/psgtr_r50_psg.py @@ -0,0 +1,233 @@ +_base_ = [ + '../_base_/models/psgtr_r50.py', '../_base_/datasets/psg.py', + '../_base_/custom_runtime.py' +] + +custom_imports = dict(imports=[ + 'openpsg.models.frameworks.psgtr', 'openpsg.models.losses.seg_losses', + 'openpsg.models.relation_heads.psgtr_head', 'openpsg.datasets', + 'openpsg.datasets.pipelines.loading', + 'openpsg.datasets.pipelines.rel_randomcrop', + 'openpsg.models.relation_heads.approaches.matcher', 'openpsg.utils' +], + allow_failed_imports=False) + +dataset_type = 'PanopticSceneGraphDataset' + +# HACK: +object_classes = [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', + 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', + 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', + 'baseball bat', 'baseball 
glove', 'skateboard', 'surfboard', + 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', + 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', + 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', + 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', + 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', + 'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard', + 'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit', + 'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform', + 'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea', + 'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone', + 'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other', + 'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged', + 'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged', + 'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged', + 'food-other-merged', 'building-other-merged', 'rock-merged', + 'wall-other-merged', 'rug-merged' +] + +predicate_classes = [ + 'over', + 'in front of', + 'beside', + 'on', + 'in', + 'attached to', + 'hanging from', + 'on back of', + 'falling off', + 'going down', + 'painted on', + 'walking on', + 'running on', + 'crossing', + 'standing on', + 'lying on', + 'sitting on', + 'flying over', + 'jumping over', + 'jumping from', + 'wearing', + 'holding', + 'carrying', + 'looking at', + 'guiding', + 'kissing', + 'eating', + 'drinking', + 'feeding', + 'biting', + 'catching', + 'picking', + 'playing with', + 'chasing', + 'climbing', + 'cleaning', + 'playing', + 'touching', + 'pushing', + 'pulling', + 'opening', + 'cooking', + 'talking to', + 'throwing', + 'slicing', + 'driving', + 'riding', + 'parked on', + 'driving on', + 'about to hit', + 'kicking', + 'swinging', + 
'entering', + 'exiting', + 'enclosing', + 'leaning on', +] + +model = dict(bbox_head=dict( + num_classes=len(object_classes), + num_relations=len(predicate_classes), + object_classes=object_classes, + predicate_classes=predicate_classes, + use_mask=True, + num_query=100, +), ) + +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True) +# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different +# from the default setting in mmdet. +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadPanopticSceneGraphAnnotations', + with_bbox=True, + with_rel=True, + with_mask=True, + with_seg=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict( + type='AutoAugment', + policies=[ + [ + dict(type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + keep_ratio=True) + ], + [ + dict(type='Resize', + img_scale=[(400, 1333), (500, 1333), (600, 1333)], + multiscale_mode='value', + keep_ratio=True), + dict(type='RelRandomCrop', + crop_type='absolute_range', + crop_size=(384, 600), + allow_negative_crop=False), # no empty relations + dict(type='Resize', + img_scale=[(480, 1333), (512, 1333), (544, 1333), + (576, 1333), (608, 1333), (640, 1333), + (672, 1333), (704, 1333), (736, 1333), + (768, 1333), (800, 1333)], + multiscale_mode='value', + override=True, + keep_ratio=True) + ] + ]), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='RelsFormatBundle'), + dict(type='Collect', + keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_masks']) +] +# test_pipeline, NOTE the Pad's size_divisor is different from the default +# setting (size_divisor=32). While there is little effect on the performance +# whether we use the default setting or use size_divisor=1. 
+test_pipeline = [ + dict(type='LoadImageFromFile'), + # dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=1), + dict(type='ImageToTensor', keys=['img']), + # dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']), + # dict(type='ToDataContainer', fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))), + dict(type='Collect', keys=['img']), + ]) +] + +evaluation = dict( + interval=1, + metric='sgdet', + relation_mode=True, + classwise=True, + iou_thrs=0.5, + detection_method='pan_seg', +) + +data = dict(samples_per_gpu=1, + workers_per_gpu=2, + train=dict(pipeline=train_pipeline), + val=dict(pipeline=test_pipeline), + test=dict(pipeline=test_pipeline)) +# optimizer +optimizer = dict( + type='AdamW', + lr=0.0001, + weight_decay=0.0001, + paramwise_cfg=dict(custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + })) +optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2)) + +# learning policy +lr_config = dict(policy='step', step=40) +runner = dict(type='EpochBasedRunner', max_epochs=60) + +project_name = 'psgformer' +expt_name = 'psgtr_r50_psg_0.5_scale_mask' +work_dir = f'./work_dirs/{expt_name}' +checkpoint_config = dict(interval=2, max_keep_ckpts=10) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook'), + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + # config=work_dir + "/cfg.yaml" + ), + ) + ], +) + +load_from = 'work_dirs/checkpoints/detr_pan_r50.pth' diff --git a/OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py b/OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..7d32a233c2690c53b40a60a69d10b6fa58d0ea7f --- /dev/null 
+++ b/OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py @@ -0,0 +1,31 @@ +_base_ = [ + './psgtr_r50_psg.py' +] + +img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True) +pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + # NOTE: Do not change the img to DC. + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + + ], + ), +] + +data = dict( + test=dict( + pipeline=pipeline, + ), +) \ No newline at end of file diff --git a/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_predcls_psg.py b/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_predcls_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..faabe0d659a7e1b24b2f58dda644a9a0fe8faf08 --- /dev/null +++ b/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_predcls_psg.py @@ -0,0 +1,28 @@ +_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py' + +model = dict(backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'))) + +# Log config +project_name = 'openpsg' +expt_name = 'vctree_panoptic_fpn_r101_fpn_1x_predcls_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + # config=work_dir + "/cfg.yaml" + ), + ), + ], +) + +load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth' diff --git a/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_sgdet_psg.py b/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_sgdet_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..cc49e1368baa36b8fcc2c14a3fb7703e51c854f2 --- 
/dev/null +++ b/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_sgdet_psg.py @@ -0,0 +1,28 @@ +_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py' + +model = dict(backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'))) + +# Log config +project_name = 'openpsg' +expt_name = 'vctree_panoptic_fpn_r101_fpn_1x_sgdet_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + # config=work_dir + "/cfg.yaml" + ), + ), + ], +) + +load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth' diff --git a/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_predcls_psg.py b/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_predcls_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..e78db15d48d404634713181231bb498ed27b936b --- /dev/null +++ b/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_predcls_psg.py @@ -0,0 +1,43 @@ +_base_ = [ + '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py', +] + +model = dict(relation_head=dict( + type='VCTreeHead', + head_config=dict( + # NOTE: Evaluation type + use_gt_box=True, + use_gt_label=True, + ), +)) + +evaluation = dict(interval=1, + metric='predcls', + relation_mode=True, + classwise=True) + +# Change batch size and learning rate +data = dict(samples_per_gpu=16, + workers_per_gpu=0) # FIXME: Is this the problem? 
+# optimizer = dict(lr=0.001) + +# Log config +project_name = 'openpsg' +expt_name = 'vctree_panoptic_fpn_r50_fpn_1x_predcls_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + # config=work_dir + "/cfg.yaml" + ), + ), + ], +) diff --git a/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_sgdet_psg.py b/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_sgdet_psg.py new file mode 100644 index 0000000000000000000000000000000000000000..d0f05d87f47ebc28920183e317aa26d0abb15026 --- /dev/null +++ b/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_sgdet_psg.py @@ -0,0 +1,49 @@ +_base_ = [ + '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py', +] + +model = dict( + relation_head=dict( + type='VCTreeHead', + head_config=dict( + # NOTE: Evaluation type + use_gt_box=False, + use_gt_label=False, + ), + ), + roi_head=dict(bbox_head=dict(type='SceneGraphBBoxHead'), ), +) + +evaluation = dict(interval=1, + metric='sgdet', + relation_mode=True, + classwise=True, + iou_thrs=0.5, + detection_method='pan_seg') + +# Change batch size and learning rate +data = dict(samples_per_gpu=16, + # workers_per_gpu=2 + ) +# optimizer = dict(lr=0.003) + +# Log config +project_name = 'openpsg' +expt_name = 'vctree_panoptic_fpn_r50_fpn_1x_sgdet_psg' +work_dir = f'./work_dirs/{expt_name}' + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + dict( + type='WandbLoggerHook', + init_kwargs=dict( + project=project_name, + name=expt_name, + # config=work_dir + "/cfg.yaml" + ), + ), + ], +) diff --git a/README.md b/README.md index 24242ef29f59b4de0e9631b0475d05e42dc73a05..56c55de1fab90a8642f9378c9dfef888302d9530 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ --- title: OpenPSG -emoji: 🐠 -colorFrom: green -colorTo: gray +emoji: 🖼️🏙️🌄🌉 
+colorFrom: yellow +colorTo: blue sdk: gradio sdk_version: 3.1.4 app_file: app.py diff --git a/app.py b/app.py index f55e9e6c34fad83380e6d0562df75076cdd3883e..a471833d791aa71a92bad57908e9e8dc7e703dd7 100644 --- a/app.py +++ b/app.py @@ -1,15 +1,135 @@ -import numpy as np +#!/usr/bin/env python + +from __future__ import annotations + +import argparse +import os +import pathlib +import subprocess +import tarfile + +if os.getenv('SYSTEM') == 'spaces': + import mim + + mim.uninstall('mmcv-full', confirm_yes=True) + mim.install('mmcv-full==1.5.2', is_yes=True) + + subprocess.call('pip uninstall -y opencv-python'.split()) + subprocess.call('pip uninstall -y opencv-python-headless'.split()) + subprocess.call('pip install opencv-python-headless==4.5.5.64'.split()) + +import cv2 import gradio as gr +import numpy as np + +from mmdet.apis import init_detector, inference_detector +from utils import show_result +import mmcv +from mmcv import Config +import os.path as osp + +DESCRIPTION = '''# OpenPSG + +This is an official demo for [OpenPSG](https://github.com/Jingkang50/OpenPSG). 
+overview +''' +FOOTER = 'visitor badge' + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument('--device', type=str, default='cpu') + parser.add_argument('--theme', type=str) + parser.add_argument('--share', action='store_true') + parser.add_argument('--port', type=int) + parser.add_argument('--disable-queue', + dest='enable_queue', + action='store_false') + return parser.parse_args() + + +def update_input_image(image: np.ndarray) -> dict: + if image is None: + return gr.Image.update(value=None) + scale = 1500 / max(image.shape[:2]) + if scale < 1: + image = cv2.resize(image, None, fx=scale, fy=scale) + return gr.Image.update(value=image) + + +def set_example_image(example: list) -> dict: + return gr.Image.update(value=example[0]) + + +def infer(model, input_image, num_rel): + result = inference_detector(model, input_image) + return show_result(input_image, + result, + is_one_stage=True, + num_rel=num_rel, + show=True + ) + + +def main(): + args = parse_args() + + model_ckt ='OpenPSG/checkpoints/epoch_60.pth' + cfg = Config.fromfile('OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py') + + model = init_detector(cfg, model_ckt, device=args.device) + + with gr.Blocks(theme=args.theme, css='style.css') as demo: + gr.Markdown(DESCRIPTION) + + with gr.Row(): + with gr.Column(): + with gr.Row(): + input_image = gr.Image(label='Input Image', type='numpy') + with gr.Group(): + with gr.Row(): + num_rel = gr.Slider( + 5, + 100, + step=5, + value=20, + label='Number of Relations') + with gr.Row(): + run_button = gr.Button(value='Run') + # prediction_results = gr.Variable() + with gr.Column(): + with gr.Row(): + # visualization = gr.Image(label='Result', type='numpy') + result = gr.Gallery(label='Result', type='numpy') + + with gr.Row(): + paths = sorted(pathlib.Path('images').rglob('*.jpg')) + example_images = gr.Dataset(components=[input_image], + samples=[[path.as_posix()] + for path in paths]) + + gr.Markdown(FOOTER) + + 
input_image.change(fn=update_input_image, + inputs=input_image, + outputs=input_image) + + run_button.click(fn=infer, + inputs=[ + model, input_image + ], + outputs=result) + + example_images.click(fn=set_example_image, + inputs=example_images, + outputs=input_image) + + demo.launch( + enable_queue=args.enable_queue, + server_port=args.port, + share=args.share, + ) + -def sepia(input_img): - sepia_filter = np.array([ - [0.393, 0.769, 0.189], - [0.349, 0.686, 0.168], - [0.272, 0.534, 0.131] - ]) - sepia_img = input_img.dot(sepia_filter.T) - sepia_img /= sepia_img.max() - return sepia_img - -demo = gr.Interface(sepia, gr.Image(shape=(200, 200)), "image") -demo.launch(share=True) \ No newline at end of file +if __name__ == '__main__': + main() diff --git a/fake_gan.py b/fake_gan.py new file mode 100644 index 0000000000000000000000000000000000000000..723fa422afdd4a4323fff964b7b48d68315a76e5 --- /dev/null +++ b/fake_gan.py @@ -0,0 +1,56 @@ +# another demo +# https://huggingface.co/spaces/dalle-mini/dalle-mini/blob/21944e2a8508568387951fc66a30e90f1d58819d/app/gradio/app.py + +# This demo needs to be run from the repo folder. 
+# python demo/fake_gan/run.py +import os +import random +import time + +import gradio as gr + + +def fake_gan(count, *args): + time.sleep(1) + images = [ + random.choice( + [ + "https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=387&q=80", + "https://images.unsplash.com/photo-1554151228-14d9def656e4?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=386&q=80", + "https://images.unsplash.com/photo-1542909168-82c3e7fdca5c?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxzZWFyY2h8MXx8aHVtYW4lMjBmYWNlfGVufDB8fDB8fA%3D%3D&w=1000&q=80", + "https://images.unsplash.com/photo-1546456073-92b9f0a8d413?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=387&q=80", + "https://images.unsplash.com/photo-1601412436009-d964bd02edbc?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=464&q=80", + ] + ) + for _ in range(int(count)) + ] + return images + + +cheetah = os.path.join(os.path.dirname(__file__), "files/cheetah1.jpg") + +demo = gr.Interface( + fn=fake_gan, + inputs=[ + gr.Number(label="Generation Count"), + gr.Image(label="Initial Image (optional)"), + gr.Slider(0, 50, 25, label="TV_scale (for smoothness)"), + gr.Slider(0, 50, 25, label="Range_Scale (out of range RBG)"), + gr.Number(label="Seed"), + gr.Number(label="Respacing"), + ], + outputs=gr.Gallery(label="Generated Images"), + title="FD-GAN", + description="This is a fake demo of a GAN. 
In reality, the images are randomly chosen from Unsplash.", + examples=[ + [2, cheetah, 12, None, None, None], + [1, cheetah, 2, None, None, None], + [4, cheetah, 42, None, None, None], + [5, cheetah, 23, None, None, None], + [4, cheetah, 11, None, None, None], + [3, cheetah, 1, None, None, None], + ], +) + +if __name__ == "__main__": + demo.launch() \ No newline at end of file diff --git a/images/cooking.jpg b/images/cooking.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5e6a0026995854873875ca7831b0b88e473db837 Binary files /dev/null and b/images/cooking.jpg differ diff --git a/images/forrest-gump.jpg b/images/forrest-gump.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1c5b19b774bc3b9f9292aa870f7897c69aae89f4 Binary files /dev/null and b/images/forrest-gump.jpg differ diff --git a/images/friends.jpg b/images/friends.jpg new file mode 100644 index 0000000000000000000000000000000000000000..92cb34b3671d620c7034331d3ff67300c686b74a Binary files /dev/null and b/images/friends.jpg differ diff --git a/images/mbappe.jpg b/images/mbappe.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b5f8c42f389696ad2a176c0282c5f705f01e0ca9 Binary files /dev/null and b/images/mbappe.jpg differ diff --git a/images/messi.jpg b/images/messi.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bf3e1b4141b376c85df73f4c14c4cac9be858ff3 Binary files /dev/null and b/images/messi.jpg differ diff --git a/images/neymar-jr-angers-x-psg-160121.jpg b/images/neymar-jr-angers-x-psg-160121.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2e048c6998a15fa1c2af034ac259522b23d34348 Binary files /dev/null and b/images/neymar-jr-angers-x-psg-160121.jpg differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..dac17be281ce349af3d16e22fda95372e2930116 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +mmcv-full==1.5.2 
# NOTE: the commented text below is residue of the preceding diff hunks
# (tail of a pinned requirements list, style.css, and the utils.py diff
# header). It is preserved verbatim and is not part of this module's logic.
#
# +mmdet==2.25.0
# +numpy==1.22.4
# +opencv-python-headless==4.5.5.64
# +openmim==0.1.5
# +torch==1.11.0
# +torchvision==0.12.0
# diff --git a/style.css b/style.css
# new file mode 100644
# index 0000000000000000000000000000000000000000..22ad0be91ed35841bc456be4a0044474affc9a17
# --- /dev/null
# +++ b/style.css
# @@ -0,0 +1,16 @@
# +h1 {
# +  text-align: center;
# +}
# +#input-image {
# +  max-height: 300px;
# +}
# +#label-image {
# +  height: 300px;
# +}
# +#result-image {
# +  height: 300px;
# +}
# +img#visitor-badge {
# +  display: block;
# +  margin: auto;
# +}
# diff --git a/utils.py b/utils.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..0f966daf6533811ab2a96bcdc84c2cf9e7360ad8
# --- /dev/null
# +++ b/utils.py
# @@ -0,0 +1,288 @@
"""Visualisation helpers for panoptic scene graph predictions."""
from typing import Tuple
import os.path as osp
import PIL
import mmcv
import numpy as np
from detectron2.utils.colormap import colormap
from detectron2.utils.visualizer import VisImage, Visualizer
from mmdet.datasets.coco_panoptic import INSTANCE_OFFSET
# ``import PIL`` alone does not load these submodules; import them explicitly
# instead of relying on another package having imported them first.
from PIL import Image, ImageEnhance

# Object category names. ``show_result`` indexes this list directly with
# panoptic class ids and with ``label - 1`` for ``result.labels``.
CLASSES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
    'food-other-merged', 'building-other-merged', 'rock-merged',
    'wall-other-merged', 'rug-merged', 'background'
]

# Relation names. ``show_result`` indexes this list with the argmax over
# ``result.rel_dists[:, 1:]``, i.e. after the first (background) column
# has been dropped.
PREDICATES = [
    'over',
    'in front of',
    'beside',
    'on',
    'in',
    'attached to',
    'hanging from',
    'on back of',
    'falling off',
    'going down',
    'painted on',
    'walking on',
    'running on',
    'crossing',
    'standing on',
    'lying on',
    'sitting on',
    'flying over',
    'jumping over',
    'jumping from',
    'wearing',
    'holding',
    'carrying',
    'looking at',
    'guiding',
    'kissing',
    'eating',
    'drinking',
    'feeding',
    'biting',
    'catching',
    'picking',
    'playing with',
    'chasing',
    'climbing',
    'cleaning',
    'playing',
    'touching',
    'pushing',
    'pulling',
    'opening',
    'cooking',
    'talking to',
    'throwing',
    'slicing',
    'driving',
    'riding',
    'parked on',
    'driving on',
    'about to hit',
    'kicking',
    'swinging',
    'entering',
    'exiting',
    'enclosing',
    'leaning on',
]


def get_colormap(num_colors: int):
    """Return ``num_colors`` colour triples as a nested list.

    The colours come from detectron2's ``colormap()``; ``np.resize`` truncates
    the table or repeats it cyclically to reach exactly ``num_colors`` rows.
    """
    return (np.resize(colormap(), (num_colors, 3))).tolist()


def draw_text(
    viz_img: VisImage = None,
    text: str = None,
    x: float = None,
    y: float = None,
    color: Tuple[float, float, float] = (0, 0, 0),
    size: float = 10,
    padding: float = 5,
    box_color: str = 'black',
    font: str = None,
) -> float:
    """Draw ``text`` inside a filled box on ``viz_img`` and return its width.

    Args:
        viz_img: Canvas whose ``ax`` receives the text.
        text: String to draw.
        x: Left coordinate of the text anchor.
        y: Top coordinate of the text anchor.
        color: Text colour.
        size: Font size.
        padding: Padding of the background box around the text.
        box_color: Face colour of the background box.
        font: Accepted for backward compatibility; currently not applied.

    Returns:
        Width of the rendered background box, as reported by the patch's
        ``get_extents()``.
    """
    text_obj = viz_img.ax.text(
        x,
        y,
        text,
        size=size,
        bbox={
            'facecolor': box_color,
            'alpha': 0.8,
            'pad': padding,
            'edgecolor': 'none',
        },
        verticalalignment='top',
        horizontalalignment='left',
        color=color,
        zorder=10,
        rotation=0,
    )
    # Rendering is triggered before the extents are read; presumably the
    # patch geometry is only populated once the canvas has been drawn.
    viz_img.get_image()
    text_dims = text_obj.get_bbox_patch().get_extents()

    return text_dims.width


def _panoptic_instances(pan_results, img_w, img_h, num_classes=133):
    """Split a panoptic id map into class names and per-segment masks."""
    ids = np.unique(pan_results)[::-1]
    ids = ids[ids != num_classes]  # drop the VOID label

    class_ids = (ids % INSTANCE_OFFSET).astype(np.int64)
    labels = [CLASSES[class_id] for class_id in class_ids]

    # (N_m, H, W) boolean masks, resized to the input image size.
    segms = pan_results[None] == ids[:, None, None]
    segms = [
        mmcv.image.imresize(m.astype(float), (img_w, img_h)) for m in segms
    ]
    return labels, segms


def show_result(img,
                result,
                is_one_stage,
                num_rel=20,
                show=False,
                out_dir=None,
                out_file=None):
    """Render predicted masks and the top-scoring relations of ``result``.

    Args:
        img: Image (or path) accepted by ``mmcv.imread``.
        result: Prediction object providing ``masks``, ``labels``,
            ``rel_dists`` and ``rel_pair_idxes`` (and ``pan_results`` when
            ``is_one_stage`` is false).
        is_one_stage: If true, the overview uses ``result.masks`` /
            ``result.labels``; otherwise segments from ``result.pan_results``.
        num_rel: Maximum number of relations to visualise. Values larger than
            the number of available relations are clamped; values ``<= 0``
            yield no relation images.
        show: If true, relation visualisations are rendered and returned.
        out_dir: Unused.
        out_file: If given, the desaturated image is written to
            ``'bw' + out_file`` and the mask overview to ``out_file``.

    Returns:
        A list with one image array per selected relation (the masked image
        stacked on top of a "subject - predicate - object" text strip), or an
        empty list when ``show`` is false or there is nothing to draw.
    """
    # Load image
    img = mmcv.imread(img)
    img = img.copy()  # (H, W, 3)
    img_h, img_w = img.shape[:-1]

    # Desaturate the image so the coloured masks stand out.
    img = Image.fromarray(img)
    img = ImageEnhance.Color(img).enhance(0.01)
    if out_file is not None:
        # NOTE(review): the prefix is prepended to the whole path, so an
        # ``out_file`` containing a directory lands under 'bw<dir>/...'.
        # Kept as is because callers may depend on this location.
        mmcv.imwrite(np.asarray(img), 'bw' + out_file)

    # Labels and masks of the objects referenced by the relations (for psgtr).
    rel_obj_labels = [CLASSES[label - 1] for label in result.labels]
    masks = result.masks

    if is_one_stage:
        overlay_labels, overlay_masks = rel_obj_labels, masks
    else:
        overlay_labels, overlay_masks = _panoptic_instances(
            result.pan_results, img_w, img_h)

    # Choose colors for each instance in coco
    colormap_coco = get_colormap(len(overlay_masks))
    colormap_coco = (np.array(colormap_coco) / 255).tolist()

    # Visualize masks (the overview is only needed when it is written out).
    if out_file is not None:
        viz = Visualizer(img)
        viz.overlay_instances(
            labels=overlay_labels,
            masks=overlay_masks,
            assigned_colors=colormap_coco,
        )
        mmcv.imwrite(viz.get_output().get_image(), out_file)

    # Relation figures are only ever returned when ``show`` is set.
    if not show:
        return []

    # Exclude background class
    rel_dists = result.rel_dists[:, 1:]
    rel_scores = rel_dists.max(1)

    # Clamp so argpartition never receives an out-of-range kth, and so that
    # asking for zero relations does not select all of them via ``[-0:]``.
    n_rel_topk = min(num_rel, len(rel_scores))
    if n_rel_topk <= 0:
        return []

    # Extract relations with top scores
    rel_topk_idx = np.argpartition(rel_scores, -n_rel_topk)[-n_rel_topk:]
    rel_labels_topk = rel_dists[rel_topk_idx].argmax(1)
    rel_pair_idxes_topk = result.rel_pair_idxes[rel_topk_idx]
    relations = np.concatenate(
        [rel_pair_idxes_topk, rel_labels_topk[..., None]], axis=1)

    strip_height = 40
    text_origin = 2
    text_size = 25
    text_padding = 20

    all_rel_vis = []
    for s_idx, o_idx, rel_id in relations:
        s_label = rel_obj_labels[s_idx]
        o_label = rel_obj_labels[o_idx]
        rel_label = PREDICATES[rel_id]

        # Image with only the subject and object masks highlighted.
        viz = Visualizer(img)
        viz.overlay_instances(
            labels=[s_label, o_label],
            masks=[masks[s_idx], masks[o_idx]],
            assigned_colors=[colormap_coco[s_idx], colormap_coco[o_idx]],
        )
        viz_masked_img = viz.get_output().get_image()

        # White strip carrying "subject  predicate  object" left to right.
        viz_graph = VisImage(np.full((strip_height, img_w, 3), 255))
        curr_x = text_origin
        for text, style in (
            (s_label, {'color': colormap_coco[s_idx]}),
            (rel_label, {'box_color': 'gainsboro'}),
            (o_label, {'color': colormap_coco[o_idx]}),
        ):
            curr_x += draw_text(
                viz_img=viz_graph,
                text=text,
                x=curr_x,
                y=text_origin,
                size=text_size,
                padding=text_padding,
                **style,
            )

        all_rel_vis.append(
            np.vstack([viz_masked_img, viz_graph.get_image()]))

    return all_rel_vis