diff --git a/OpenPSG/checkpoints/epoch_60.pth b/OpenPSG/checkpoints/epoch_60.pth
new file mode 100644
index 0000000000000000000000000000000000000000..9ca2d6cd2ca7532a08cd84df438dbdcece5049c5
--- /dev/null
+++ b/OpenPSG/checkpoints/epoch_60.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c4ddcbda74686568b7e6b8145f7f33030407e27e390c37c23206f95c51829ed
+size 531751994
diff --git a/OpenPSG/configs/_base_/custom_runtime.py b/OpenPSG/configs/_base_/custom_runtime.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c0898bafac0870b691dcfc1467a618973646e7f
--- /dev/null
+++ b/OpenPSG/configs/_base_/custom_runtime.py
@@ -0,0 +1,17 @@
+checkpoint_config = dict(interval=1, max_keep_ckpts=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+custom_hooks = [dict(type='NumClassCheckHook')]
+
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+
+workflow = [('train', 1), ('val', 1)]
diff --git a/OpenPSG/configs/_base_/datasets/psg.py b/OpenPSG/configs/_base_/datasets/psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..052dcd787578900f875b7f9d43729a188a4d2aca
--- /dev/null
+++ b/OpenPSG/configs/_base_/datasets/psg.py
@@ -0,0 +1,93 @@
+# dataset settings
+dataset_type = 'PanopticSceneGraphDataset'
+ann_file = './data/psg/psg.json'
+coco_root = 'data/coco'
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadPanopticSceneGraphAnnotations',
+        with_bbox=True,
+        with_rel=True,
+        with_mask=True,
+        with_seg=True,
+    ),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='SegRescale', scale_factor=1 / 4),
+    dict(type='SceneGraphFormatBundle'),
+    dict(
+        type='Collect',
+        keys=[
+            'img',
+            'gt_bboxes',
+            'gt_labels',
+            'gt_rels',
+            'gt_relmaps',
+            'gt_masks',
+            'gt_semantic_seg',
+        ],
+    ),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    # Since the forward process may need gt info, annos must be loaded.
+    dict(type='LoadPanopticSceneGraphAnnotations',
+         with_bbox=True,
+         with_rel=True),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            # NOTE: Do not change the img to DC.
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
+            dict(
+                type='ToDataContainer',
+                fields=(dict(key='gt_bboxes'), dict(key='gt_labels')),
+            ),
+            dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+        ],
+    ),
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file,
+        img_prefix=coco_root,
+        seg_prefix=coco_root,
+        pipeline=train_pipeline,
+        split='train',
+        all_bboxes=True,
+    ),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file,
+        img_prefix=coco_root,
+        seg_prefix=coco_root,
+        pipeline=test_pipeline,
+        split='test',
+        all_bboxes=True,
+    ),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file,
+        img_prefix=coco_root,
+        seg_prefix=coco_root,
+        pipeline=test_pipeline,
+        split='test',
+        all_bboxes=True,
+    ),
+)
diff --git a/OpenPSG/configs/_base_/datasets/psg_panoptic.py b/OpenPSG/configs/_base_/datasets/psg_panoptic.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e5ee5f27af854da81cc9b936a47d3ed7721502f
--- /dev/null
+++ b/OpenPSG/configs/_base_/datasets/psg_panoptic.py
@@ -0,0 +1,72 @@
+# dataset settings
+dataset_type = 'PanopticSceneGraphDataset'
+ann_file = './data/psg/psg.json'
+coco_root = './data/coco'
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='LoadPanopticSceneGraphAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        with_seg=True,
+    ),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='SegRescale', scale_factor=1 / 4),
+    dict(type='DefaultFormatBundle'),
+    dict(
+        type='Collect',
+        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg'],
+    ),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ],
+    ),
+]
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file,
+        img_prefix=coco_root,
+        seg_prefix=coco_root,
+        pipeline=train_pipeline,
+        split='train',
+    ),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file,
+        img_prefix=coco_root,
+        seg_prefix=coco_root,
+        pipeline=test_pipeline,
+        split='test',
+    ),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file,
+        img_prefix=coco_root,
+        seg_prefix=coco_root,
+        pipeline=test_pipeline,
+        split='test',
+    ),
+)
+evaluation = dict(interval=1, metric='PQ')
diff --git a/OpenPSG/configs/_base_/datasets/vg_detection.py b/OpenPSG/configs/_base_/datasets/vg_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..d826ecca5ea9c9bfbaf08366b5b2a468c908363b
--- /dev/null
+++ b/OpenPSG/configs/_base_/datasets/vg_detection.py
@@ -0,0 +1,56 @@
+# dataset settings
+custom_imports = dict(imports=[
+    'openpsg.datasets',
+    'openpsg.datasets.pipelines',
+],
+                      allow_failed_imports=False)
+
+dataset_type = 'SceneGraphDataset'
+ann_file = 'data/vg/data_openpsg.json'
+img_dir = 'data/vg/VG_100K'
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadSceneGraphAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='MultiScaleFlipAug',
+         img_scale=(1333, 800),
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='RandomFlip'),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=32),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img']),
+         ])
+]
+data = dict(samples_per_gpu=2,
+            workers_per_gpu=2,
+            train=dict(type=dataset_type,
+                       ann_file=ann_file,
+                       img_prefix=img_dir,
+                       pipeline=train_pipeline,
+                       split='train'),
+            val=dict(type=dataset_type,
+                     ann_file=ann_file,
+                     img_prefix=img_dir,
+                     pipeline=test_pipeline,
+                     split='test'),
+            test=dict(type=dataset_type,
+                      ann_file=ann_file,
+                      img_prefix=img_dir,
+                      pipeline=test_pipeline,
+                      split='test'))
+evaluation = dict(interval=1, metric='bbox')
diff --git a/OpenPSG/configs/_base_/datasets/vg_sg.py b/OpenPSG/configs/_base_/datasets/vg_sg.py
new file mode 100644
index 0000000000000000000000000000000000000000..5f555ac70bc04c85cbeb9099fd792114ee2ed9a9
--- /dev/null
+++ b/OpenPSG/configs/_base_/datasets/vg_sg.py
@@ -0,0 +1,57 @@
+# dataset settings
+dataset_type = 'SceneGraphDataset'
+ann_file = '/mnt/ssd/gzj/data/VisualGenome/data_openpsg.json'
+img_dir = '/mnt/ssd/gzj/data/VisualGenome/VG_100K'
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='SceneGraphFormatBundle'),
+    dict(type='Collect',
+         keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_relmaps']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    # Since the forward process may need gt info, annos must be loaded.
+    dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            # NOTE: Do not change the img to DC.
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
+            dict(type='ToDataContainer',
+                 fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))),
+            dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+        ])
+]
+data = dict(samples_per_gpu=2,
+            workers_per_gpu=2,
+            train=dict(type=dataset_type,
+                       ann_file=ann_file,
+                       img_prefix=img_dir,
+                       pipeline=train_pipeline,
+                       split='train'),
+            val=dict(type=dataset_type,
+                     ann_file=ann_file,
+                     img_prefix=img_dir,
+                     pipeline=test_pipeline,
+                     split='test'),
+            test=dict(type=dataset_type,
+                      ann_file=ann_file,
+                      img_prefix=img_dir,
+                      pipeline=test_pipeline,
+                      split='test'))
diff --git a/OpenPSG/configs/_base_/models/detr4seg_r101.py b/OpenPSG/configs/_base_/models/detr4seg_r101.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c366f686fe6b2467ec29613cb9f95a229d038cc
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/detr4seg_r101.py
@@ -0,0 +1,64 @@
+model = dict(
+    type='DETR4seg',
+    backbone=dict(type='ResNet',
+                  depth=101,
+                  num_stages=4,
+                  out_indices=(0, 1, 2, 3),
+                  frozen_stages=1,
+                  norm_cfg=dict(type='BN', requires_grad=False),
+                  norm_eval=True,
+                  style='pytorch',
+                  init_cfg=dict(type='Pretrained',
+                                checkpoint='torchvision://resnet101')),
+    bbox_head=dict(type='detr4segHead',
+                   num_classes=80,
+                   in_channels=2048,
+                   transformer=dict(
+                       type='Transformer',
+                       encoder=dict(type='DetrTransformerEncoder',
+                                    num_layers=6,
+                                    transformerlayers=dict(
+                                        type='BaseTransformerLayer',
+                                        attn_cfgs=[
+                                            dict(type='MultiheadAttention',
+                                                 embed_dims=256,
+                                                 num_heads=8,
+                                                 dropout=0.1)
+                                        ],
+                                        feedforward_channels=2048,
+                                        ffn_dropout=0.1,
+                                        operation_order=('self_attn', 'norm',
+                                                         'ffn', 'norm'))),
+                       decoder=dict(
+                           type='DetrTransformerDecoder',
+                           return_intermediate=True,
+                           num_layers=6,
+                           transformerlayers=dict(
+                               type='DetrTransformerDecoderLayer',
+                               attn_cfgs=dict(type='MultiheadAttention',
+                                              embed_dims=256,
+                                              num_heads=8,
+                                              dropout=0.1),
+                               feedforward_channels=2048,
+                               ffn_dropout=0.1,
+                               operation_order=('self_attn', 'norm',
+                                                'cross_attn', 'norm', 'ffn',
+                                                'norm')),
+                       )),
+                   positional_encoding=dict(type='SinePositionalEncoding',
+                                            num_feats=128,
+                                            normalize=True),
+                   loss_cls=dict(type='CrossEntropyLoss',
+                                 use_sigmoid=False,
+                                 loss_weight=1.0,
+                                 class_weight=1.0),
+                   loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+                   loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+                   dice_loss=dict(type='DiceLoss', loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(assigner=dict(
+        type='HungarianAssigner',
+        cls_cost=dict(type='ClassificationCost', weight=1.),
+        reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+        iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
+    test_cfg=dict(max_per_img=100))
diff --git a/OpenPSG/configs/_base_/models/detr4seg_r101_psg.py b/OpenPSG/configs/_base_/models/detr4seg_r101_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d21e75bc4fd8b693daeaa488a613feb052914fe
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/detr4seg_r101_psg.py
@@ -0,0 +1,137 @@
+_base_ = [
+    '../_base_/models/detr4seg_r101.py', '../_base_/datasets/psg.py',
+    '../_base_/custom_runtime.py'
+]
+
+custom_imports = dict(imports=[
+    'openpsg.models.frameworks.detr4seg',
+    'openpsg.models.relation_heads.detr4seg_head', 'openpsg.datasets',
+    'openpsg.datasets.pipelines.loading',
+    'openpsg.datasets.pipelines.rel_randomcrop',
+    'openpsg.models.relation_heads.approaches.matcher',
+    'openpsg.models.losses.seg_losses'
+],
+                      allow_failed_imports=False)
+
+object_classes = [
+    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
+    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
+    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
+    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
+    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
+    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
+    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
+    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
+    'food-other-merged', 'building-other-merged', 'rock-merged',
+    'wall-other-merged', 'rug-merged'
+]
+
+model = dict(bbox_head=dict(
+    num_classes=len(object_classes),
+    object_classes=object_classes,
+))
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True)
+# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
+# from the default setting in mmdet.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='AutoAugment',
+        policies=[
+            [
+                dict(type='Resize',
+                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                (576, 1333), (608, 1333), (640, 1333),
+                                (672, 1333), (704, 1333), (736, 1333),
+                                (768, 1333), (800, 1333)],
+                     multiscale_mode='value',
+                     keep_ratio=True)
+            ],
+            [
+                dict(type='Resize',
+                     img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+                     multiscale_mode='value',
+                     keep_ratio=True),
+                dict(type='RandomCrop',
+                     crop_type='absolute_range',
+                     crop_size=(384, 600),
+                     allow_negative_crop=False),  # no empty relations
+                dict(type='Resize',
+                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                (576, 1333), (608, 1333), (640, 1333),
+                                (672, 1333), (704, 1333), (736, 1333),
+                                (768, 1333), (800, 1333)],
+                     multiscale_mode='value',
+                     override=True,
+                     keep_ratio=True)
+            ]
+        ]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=1),
+    dict(type='RelsFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
+]
+# test_pipeline, NOTE the Pad's size_divisor is different from the default
+# setting (size_divisor=32). There is little effect on the performance
+# whether we use the default setting or use size_divisor=1.
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='MultiScaleFlipAug',
+         img_scale=(1333, 800),
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='RandomFlip'),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=1),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img'])
+         ])
+]
+data = dict(samples_per_gpu=2,
+            workers_per_gpu=2,
+            train=dict(pipeline=train_pipeline),
+            val=dict(pipeline=test_pipeline),
+            test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='AdamW',
+    lr=0.0001,
+    weight_decay=0.0001,
+    paramwise_cfg=dict(
+        custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}))
+optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
+
+# learning policy
+lr_config = dict(policy='step', step=110)
+runner = dict(type='EpochBasedRunner', max_epochs=150)
+
+project_name = 'detr4seg'
+expt_name = 'detr4seg_r101_coco'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[dict(type='TextLoggerHook'),
+           dict(type='TensorboardLoggerHook')],
+)
+# NOTE(review): machine-specific path; file name says r50 but base is r101
+load_from = '/mnt/ssd/gzj/test/OpenPSG/detr_r50_fb_origin.pth'
diff --git a/OpenPSG/configs/_base_/models/detr4seg_r50.py b/OpenPSG/configs/_base_/models/detr4seg_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..326bc62336154ca94211a820406fb26025a9c544
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/detr4seg_r50.py
@@ -0,0 +1,65 @@
+model = dict(
+    type='DETR4seg',
+    backbone=dict(type='ResNet',
+                  depth=50,
+                  num_stages=4,
+                  out_indices=(0, 1, 2, 3),
+                  frozen_stages=1,
+                  norm_cfg=dict(type='BN', requires_grad=False),
+                  norm_eval=True,
+                  style='pytorch',
+                  init_cfg=dict(type='Pretrained',
+                                checkpoint='torchvision://resnet50')),
+    bbox_head=dict(type='detr4segHead',
+                   num_classes=80,
+                   in_channels=2048,
+                   transformer=dict(
+                       type='Transformer',
+                       encoder=dict(type='DetrTransformerEncoder',
+                                    num_layers=6,
+                                    transformerlayers=dict(
+                                        type='BaseTransformerLayer',
+                                        attn_cfgs=[
+                                            dict(type='MultiheadAttention',
+                                                 embed_dims=256,
+                                                 num_heads=8,
+                                                 dropout=0.1)
+                                        ],
+                                        feedforward_channels=2048,
+                                        ffn_dropout=0.1,
+                                        operation_order=('self_attn', 'norm',
+                                                         'ffn', 'norm'))),
+                       decoder=dict(
+                           type='DetrTransformerDecoder',
+                           return_intermediate=True,
+                           num_layers=6,
+                           transformerlayers=dict(
+                               type='DetrTransformerDecoderLayer',
+                               attn_cfgs=dict(type='MultiheadAttention',
+                                              embed_dims=256,
+                                              num_heads=8,
+                                              dropout=0.1),
+                               feedforward_channels=2048,
+                               ffn_dropout=0.1,
+                               operation_order=('self_attn', 'norm',
+                                                'cross_attn', 'norm', 'ffn',
+                                                'norm')),
+                       )),
+                   positional_encoding=dict(type='SinePositionalEncoding',
+                                            num_feats=128,
+                                            normalize=True),
+                   loss_cls=dict(type='CrossEntropyLoss',
+                                 use_sigmoid=False,
+                                 loss_weight=1.0,
+                                 class_weight=1.0),
+                   loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+                   loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+                   focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0),
+                   dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(assigner=dict(
+        type='HungarianAssigner',
+        cls_cost=dict(type='ClassificationCost', weight=1.),
+        reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+        iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
+    test_cfg=dict(max_per_img=100))
diff --git a/OpenPSG/configs/_base_/models/detr4seg_r50_psg.py b/OpenPSG/configs/_base_/models/detr4seg_r50_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..07324d4942419d7879ce771a19cc8215a45fd5d2
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/detr4seg_r50_psg.py
@@ -0,0 +1,152 @@
+_base_ = ['./detr4seg_r50.py', '../datasets/psg.py', '../custom_runtime.py']
+
+custom_imports = dict(imports=[
+    'openpsg.models.frameworks.detr4seg',
+    'openpsg.models.relation_heads.detr4seg_head', 'openpsg.datasets',
+    'openpsg.datasets.pipelines.loading',
+    'openpsg.datasets.pipelines.rel_randomcrop',
+    'openpsg.models.relation_heads.approaches.matcher',
+    'openpsg.models.losses.seg_losses'
+],
+                      allow_failed_imports=False)
+
+object_classes = [
+    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
+    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
+    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
+    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
+    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
+    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
+    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
+    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
+    'food-other-merged', 'building-other-merged', 'rock-merged',
+    'wall-other-merged', 'rug-merged'
+]
+
+model = dict(bbox_head=dict(
+    num_classes=len(object_classes),
+    object_classes=object_classes,
+))
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True)
+# train_pipeline, NOTE the img_scale and the Pad's size_divisor are different
+# from the default setting in mmdet.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadPanopticSceneGraphAnnotations',
+         with_bbox=True,
+         with_mask=True,
+         with_seg=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='AutoAugment',
+        policies=[
+            [
+                dict(type='Resize',
+                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                (576, 1333), (608, 1333), (640, 1333),
+                                (672, 1333), (704, 1333), (736, 1333),
+                                (768, 1333), (800, 1333)],
+                     multiscale_mode='value',
+                     keep_ratio=True)
+            ],
+            [
+                dict(type='Resize',
+                     img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+                     multiscale_mode='value',
+                     keep_ratio=True),
+                dict(type='RandomCrop',
+                     crop_type='absolute_range',
+                     crop_size=(384, 600),
+                     allow_negative_crop=False),  # no empty relations
+                dict(type='Resize',
+                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                (576, 1333), (608, 1333), (640, 1333),
+                                (672, 1333), (704, 1333), (736, 1333),
+                                (768, 1333), (800, 1333)],
+                     multiscale_mode='value',
+                     override=True,
+                     keep_ratio=True)
+            ]
+        ]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=1),
+    dict(type='RelsFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks'])
+]
+# test_pipeline, NOTE the Pad's size_divisor is different from the default
+# setting (size_divisor=32). There is little effect on the performance
+# whether we use the default setting or use size_divisor=1.
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='MultiScaleFlipAug',
+         img_scale=(1333, 800),
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='RandomFlip'),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=1),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='Collect', keys=['img'])
+         ])
+]
+data = dict(samples_per_gpu=1,
+            workers_per_gpu=1,
+            train=dict(pipeline=train_pipeline),
+            val=dict(pipeline=test_pipeline),
+            test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(type='AdamW',
+                 lr=0.00001,
+                 weight_decay=0.0001,
+                 paramwise_cfg=dict(
+                     custom_keys={
+                         'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+                         'bbox_attention': dict(lr_mult=10.0, decay_mult=1.0),
+                         'mask_head': dict(lr_mult=10.0, decay_mult=1.0)
+                     }))
+optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
+
+# learning policy
+lr_config = dict(policy='step', step=8)
+runner = dict(type='EpochBasedRunner', max_epochs=10)
+
+evaluation = dict(interval=1, metric='PQ')
+checkpoint_config = dict(interval=1, max_keep_ckpts=10)
+
+project_name = 'detr4seg'
+expt_name = 'test_detr4seg_r50_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook'),
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+                # config=work_dir + "/cfg.yaml"
+            ))
+    ],
+)
+
+load_from = 'detr_pan_r50.pth'
diff --git a/OpenPSG/configs/_base_/models/detr_r50.py b/OpenPSG/configs/_base_/models/detr_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..b83d7d5e108ff52eb9c2c8701697684e1fd88844
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/detr_r50.py
@@ -0,0 +1,64 @@
+model = dict(  # DETR detector: ResNet-50 backbone + transformer-based DETRHead
+    type='DETR',
+    backbone=dict(type='ResNet',
+                  depth=50,
+                  num_stages=4,
+                  out_indices=(3, ),  # only the last of the four stages is output
+                  frozen_stages=1,
+                  norm_cfg=dict(type='BN', requires_grad=False),
+                  norm_eval=True,
+                  style='pytorch',
+                  init_cfg=dict(type='Pretrained',
+                                checkpoint='torchvision://resnet50')),
+    bbox_head=dict(type='DETRHead',
+                   num_classes=80,
+                   in_channels=2048,
+                   transformer=dict(
+                       type='Transformer',
+                       encoder=dict(type='DetrTransformerEncoder',  # 6 layers: self_attn -> norm -> ffn -> norm
+                                    num_layers=6,
+                                    transformerlayers=dict(
+                                        type='BaseTransformerLayer',
+                                        attn_cfgs=[
+                                            dict(type='MultiheadAttention',
+                                                 embed_dims=256,
+                                                 num_heads=8,
+                                                 dropout=0.1)
+                                        ],
+                                        feedforward_channels=2048,
+                                        ffn_dropout=0.1,
+                                        operation_order=('self_attn', 'norm',
+                                                         'ffn', 'norm'))),
+                       decoder=dict(  # 6 layers; each adds cross_attn between self_attn and ffn
+                           type='DetrTransformerDecoder',
+                           return_intermediate=True,
+                           num_layers=6,
+                           transformerlayers=dict(
+                               type='DetrTransformerDecoderLayer',
+                               attn_cfgs=dict(type='MultiheadAttention',
+                                              embed_dims=256,
+                                              num_heads=8,
+                                              dropout=0.1),
+                               feedforward_channels=2048,
+                               ffn_dropout=0.1,
+                               operation_order=('self_attn', 'norm',
+                                                'cross_attn', 'norm', 'ffn',
+                                                'norm')),
+                       )),
+                   positional_encoding=dict(type='SinePositionalEncoding',
+                                            num_feats=128,
+                                            normalize=True),
+                   loss_cls=dict(type='CrossEntropyLoss',
+                                 bg_cls_weight=0.1,
+                                 use_sigmoid=False,
+                                 loss_weight=1.0,
+                                 class_weight=1.0),
+                   loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+                   loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
+    # training and testing settings; assigner cost weights (1 / 5 / 2) mirror the loss weights above
+    train_cfg=dict(assigner=dict(
+        type='HungarianAssigner',
+        cls_cost=dict(type='ClassificationCost', weight=1.),
+        reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'),
+        iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
+    test_cfg=dict(max_per_img=100))
diff --git a/OpenPSG/configs/_base_/models/mask_rcnn_r50_fpn.py b/OpenPSG/configs/_base_/models/mask_rcnn_r50_fpn.py
new file mode 100644
index 0000000000000000000000000000000000000000..19b9c3d29c7af8ac828c25a1b388248aa23a2d77
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/mask_rcnn_r50_fpn.py
@@ -0,0 +1,107 @@
+# model settings: Mask R-CNN with a ResNet-50 backbone, FPN neck, RPN and a standard RoI head
+model = dict(
+    type='MaskRCNN',
+    backbone=dict(type='ResNet',
+                  depth=50,
+                  num_stages=4,
+                  out_indices=(0, 1, 2, 3),  # all four stages are passed to the neck
+                  frozen_stages=1,
+                  norm_cfg=dict(type='BN', requires_grad=True),
+                  norm_eval=True,
+                  style='pytorch',
+                  init_cfg=dict(type='Pretrained',
+                                checkpoint='torchvision://resnet50')),
+    neck=dict(type='FPN',
+              in_channels=[256, 512, 1024, 2048],  # one entry per backbone output
+              out_channels=256,
+              num_outs=5),
+    rpn_head=dict(type='RPNHead',
+                  in_channels=256,
+                  feat_channels=256,
+                  anchor_generator=dict(type='AnchorGenerator',
+                                        scales=[8],
+                                        ratios=[0.5, 1.0, 2.0],
+                                        strides=[4, 8, 16, 32, 64]),  # five strides, matching num_outs=5
+                  bbox_coder=dict(type='DeltaXYWHBBoxCoder',
+                                  target_means=[.0, .0, .0, .0],
+                                  target_stds=[1.0, 1.0, 1.0, 1.0]),
+                  loss_cls=dict(type='CrossEntropyLoss',
+                                use_sigmoid=True,
+                                loss_weight=1.0),
+                  loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(type='StandardRoIHead',
+                  bbox_roi_extractor=dict(type='SingleRoIExtractor',
+                                          roi_layer=dict(type='RoIAlign',
+                                                         output_size=7,
+                                                         sampling_ratio=0),
+                                          out_channels=256,
+                                          featmap_strides=[4, 8, 16, 32]),  # only four strides are listed here
+                  bbox_head=dict(
+                      type='Shared2FCBBoxHead',
+                      in_channels=256,
+                      fc_out_channels=1024,
+                      roi_feat_size=7,  # same value as the bbox RoIAlign output_size
+                      num_classes=80,
+                      bbox_coder=dict(type='DeltaXYWHBBoxCoder',
+                                      target_means=[0., 0., 0., 0.],
+                                      target_stds=[0.1, 0.1, 0.2, 0.2]),
+                      reg_class_agnostic=False,
+                      loss_cls=dict(type='CrossEntropyLoss',
+                                    use_sigmoid=False,
+                                    loss_weight=1.0),
+                      loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+                  mask_roi_extractor=dict(type='SingleRoIExtractor',
+                                          roi_layer=dict(type='RoIAlign',
+                                                         output_size=14,  # mask branch uses 14 (bbox branch uses 7)
+                                                         sampling_ratio=0),
+                                          out_channels=256,
+                                          featmap_strides=[4, 8, 16, 32]),
+                  mask_head=dict(type='FCNMaskHead',
+                                 num_convs=4,
+                                 in_channels=256,
+                                 conv_out_channels=256,
+                                 num_classes=80,
+                                 loss_mask=dict(type='CrossEntropyLoss',
+                                                use_mask=True,
+                                                loss_weight=1.0))),
+    # model training and testing settings (separate sub-configs for the RPN and the RCNN stage)
+    train_cfg=dict(rpn=dict(assigner=dict(type='MaxIoUAssigner',
+                                          pos_iou_thr=0.7,
+                                          neg_iou_thr=0.3,
+                                          min_pos_iou=0.3,
+                                          match_low_quality=True,
+                                          ignore_iof_thr=-1),
+                            sampler=dict(type='RandomSampler',
+                                         num=256,
+                                         pos_fraction=0.5,
+                                         neg_pos_ub=-1,
+                                         add_gt_as_proposals=False),
+                            allowed_border=-1,
+                            pos_weight=-1,
+                            debug=False),
+                   rpn_proposal=dict(nms_pre=2000,
+                                     max_per_img=1000,
+                                     nms=dict(type='nms', iou_threshold=0.7),
+                                     min_bbox_size=0),
+                   rcnn=dict(assigner=dict(type='MaxIoUAssigner',
+                                           pos_iou_thr=0.5,
+                                           neg_iou_thr=0.5,
+                                           min_pos_iou=0.5,
+                                           match_low_quality=True,
+                                           ignore_iof_thr=-1),
+                             sampler=dict(type='RandomSampler',
+                                          num=512,
+                                          pos_fraction=0.25,
+                                          neg_pos_ub=-1,
+                                          add_gt_as_proposals=True),
+                             mask_size=28,
+                             pos_weight=-1,
+                             debug=False)),
+    test_cfg=dict(rpn=dict(nms_pre=1000,  # train-time rpn_proposal uses nms_pre=2000
+                           max_per_img=1000,
+                           nms=dict(type='nms', iou_threshold=0.7),
+                           min_bbox_size=0),
+                  rcnn=dict(score_thr=0.05,
+                            nms=dict(type='nms', iou_threshold=0.5),
+                            max_per_img=100,
+                            mask_thr_binary=0.5)))
diff --git a/OpenPSG/configs/_base_/models/panoptic_fpn_r101_fpn_psg.py b/OpenPSG/configs/_base_/models/panoptic_fpn_r101_fpn_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..449ec6c9ff81c8447bc74029fad68d1bb3dc9598
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/panoptic_fpn_r101_fpn_psg.py
@@ -0,0 +1,8 @@
+_base_ = './panoptic_fpn_r50_fpn_psg.py'  # inherit everything from the ResNet-50 variant
+
+model = dict(backbone=dict(  # override only the backbone depth and its pretrained checkpoint
+    depth=101,
+    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+expt_name = 'panoptic_fpn_r101_fpn_psg'  # NOTE(review): work_dir and log_config are not redefined in this file, so they presumably keep the values built from the r50 base's expt_name - confirm
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/_base_/models/panoptic_fpn_r50_fpn_psg.py b/OpenPSG/configs/_base_/models/panoptic_fpn_r50_fpn_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..44a01a4ea386ddb8c4264a6454da4d70ffde63fc
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/panoptic_fpn_r50_fpn_psg.py
@@ -0,0 +1,74 @@
+_base_ = [  # model, dataset, schedule and runtime bases; the settings below override them
+    '../models/mask_rcnn_r50_fpn.py',
+    '../datasets/psg_panoptic.py',
+    '../schedules/schedule_1x.py',
+    '../custom_runtime.py',
+]
+
+model = dict(
+    type='PanopticFPN',  # replaces the 'MaskRCNN' type from mask_rcnn_r50_fpn.py
+    semantic_head=dict(
+        type='PanopticFPNHead',
+        num_things_classes=80,
+        num_stuff_classes=53,
+        in_channels=256,
+        inner_channels=128,
+        start_level=0,
+        end_level=4,
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+        conv_cfg=None,
+        loss_seg=dict(type='CrossEntropyLoss',
+                      ignore_index=255,
+                      loss_weight=0.5),
+    ),
+    panoptic_fusion_head=dict(type='HeuristicFusionHead',
+                              num_things_classes=80,  # same class counts as semantic_head
+                              num_stuff_classes=53),
+    test_cfg=dict(panoptic=dict(
+        score_thr=0.6,
+        max_per_img=100,
+        mask_thr_binary=0.5,
+        mask_overlap=0.5,
+        nms=dict(type='nms', iou_threshold=0.5, class_agnostic=True),
+        stuff_area_limit=4096,
+    )),
+)
+
+custom_hooks = []  # empty list; custom_runtime.py sets [dict(type='NumClassCheckHook')]
+
+# Change batch size and learning rate
+data = dict(samples_per_gpu=8,
+            # workers_per_gpu=2
+            )
+# optimizer = dict(lr=0.02)
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)  # same values as schedule_1x.py
+optimizer_config = dict(_delete_=True,  # _delete_ replaces the inherited dict (grad_clip=None) rather than merging into it
+                        grad_clip=dict(max_norm=35, norm_type=2))
+
+lr_config = dict(policy='step',
+                 warmup='linear',
+                 warmup_iters=500,
+                 warmup_ratio=1.0 / 3,  # schedule_1x.py uses warmup_ratio=0.001
+                 step=[8, 11])
+
+project_name = 'openpsg'
+expt_name = 'panoptic_fpn_r50_fpn_psg'
+work_dir = f'./work_dirs/{expt_name}'  # output directory is derived from expt_name
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,  # run name is the experiment name
+                # config=work_dir + "/cfg.yaml"
+            ),
+        ),
+    ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth'
diff --git a/OpenPSG/configs/_base_/models/psgtr_r101.py b/OpenPSG/configs/_base_/models/psgtr_r101.py
new file mode 100644
index 0000000000000000000000000000000000000000..28a043e12a54656ed52202a348058bd0dc3d6f9d
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/psgtr_r101.py
@@ -0,0 +1,5 @@
+_base_ = './psgtr_r50.py'  # inherit the full PSGTr model definition from the ResNet-50 variant
+
+model = dict(backbone=dict(  # override only the backbone depth and its pretrained checkpoint
+    depth=101,
+    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
diff --git a/OpenPSG/configs/_base_/models/psgtr_r50.py b/OpenPSG/configs/_base_/models/psgtr_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..96eccd68df077c5de98613fe62d4bcacb5b7f5a4
--- /dev/null
+++ b/OpenPSG/configs/_base_/models/psgtr_r50.py
@@ -0,0 +1,82 @@
+model = dict(  # PSGTr: ResNet-50 backbone + transformer head with subject / object / relation losses
+    type='PSGTr',
+    backbone=dict(type='ResNet',
+                  depth=50,
+                  num_stages=4,
+                  out_indices=(0, 1, 2, 3),  # all four stages are output
+                  frozen_stages=1,
+                  norm_cfg=dict(type='BN', requires_grad=False),
+                  norm_eval=True,
+                  style='pytorch',
+                  init_cfg=dict(type='Pretrained',
+                                checkpoint='torchvision://resnet50')),
+    bbox_head=dict(type='PSGTrHead',
+                   num_classes=80,
+                   num_relations=117,
+                   in_channels=2048,
+                   transformer=dict(
+                       type='Transformer',
+                       encoder=dict(type='DetrTransformerEncoder',  # 6 layers: self_attn -> norm -> ffn -> norm
+                                    num_layers=6,
+                                    transformerlayers=dict(
+                                        type='BaseTransformerLayer',
+                                        attn_cfgs=[
+                                            dict(type='MultiheadAttention',
+                                                 embed_dims=256,
+                                                 num_heads=8,
+                                                 dropout=0.1)
+                                        ],
+                                        feedforward_channels=2048,
+                                        ffn_dropout=0.1,
+                                        operation_order=('self_attn', 'norm',
+                                                         'ffn', 'norm'))),
+                       decoder=dict(  # 6 layers; each adds cross_attn between self_attn and ffn
+                           type='DetrTransformerDecoder',
+                           return_intermediate=True,
+                           num_layers=6,
+                           transformerlayers=dict(
+                               type='DetrTransformerDecoderLayer',
+                               attn_cfgs=dict(type='MultiheadAttention',
+                                              embed_dims=256,
+                                              num_heads=8,
+                                              dropout=0.1),
+                               feedforward_channels=2048,
+                               ffn_dropout=0.1,
+                               operation_order=('self_attn', 'norm',
+                                                'cross_attn', 'norm', 'ffn',
+                                                'norm')),
+                       )),
+                   positional_encoding=dict(type='SinePositionalEncoding',
+                                            num_feats=128,
+                                            normalize=True),
+                   sub_loss_cls=dict(type='CrossEntropyLoss',  # sub_* losses; the obj_* set below uses identical settings
+                                     use_sigmoid=False,
+                                     loss_weight=1.0,
+                                     class_weight=1.0),
+                   sub_loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+                   sub_loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+                   sub_focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0),
+                   sub_dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0),
+                   obj_loss_cls=dict(type='CrossEntropyLoss',
+                                     use_sigmoid=False,
+                                     loss_weight=1.0,
+                                     class_weight=1.0),
+                   obj_loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+                   obj_loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+                   obj_focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0),
+                   obj_dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0),
+                   rel_loss_cls=dict(type='CrossEntropyLoss',
+                                     use_sigmoid=False,
+                                     loss_weight=2.0,  # relation classification is weighted 2.0 (sub/obj use 1.0)
+                                     class_weight=1.0)),
+    # training and testing settings; s_/o_/r_ cost weights mirror the sub/obj/rel cls, bbox and iou loss weights
+    train_cfg=dict(assigner=dict(
+        type='HTriMatcher',
+        s_cls_cost=dict(type='ClassificationCost', weight=1.),
+        s_reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+        s_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+        o_cls_cost=dict(type='ClassificationCost', weight=1.),
+        o_reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+        o_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+        r_cls_cost=dict(type='ClassificationCost', weight=2.))),
+    test_cfg=dict(max_per_img=100))
diff --git a/OpenPSG/configs/_base_/schedules/schedule_1x.py b/OpenPSG/configs/_base_/schedules/schedule_1x.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c01d3df3d9169fee87ffeaa4e0fb60ac3f07b66
--- /dev/null
+++ b/OpenPSG/configs/_base_/schedules/schedule_1x.py
@@ -0,0 +1,10 @@
+# optimizer: SGD with momentum, no gradient clipping
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy: 'step' policy with steps at 8 and 11, linear warmup, 12 epochs in total
+lr_config = dict(policy='step',
+                 warmup='linear',
+                 warmup_iters=500,
+                 warmup_ratio=0.001,
+                 step=[8, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/OpenPSG/configs/_base_/schedules/schedule_3x.py b/OpenPSG/configs/_base_/schedules/schedule_3x.py
new file mode 100644
index 0000000000000000000000000000000000000000..4109da969702ecb2962606ec3891cedfcd4cd2ae
--- /dev/null
+++ b/OpenPSG/configs/_base_/schedules/schedule_3x.py
@@ -0,0 +1,10 @@
+# optimizer: SGD with momentum, no gradient clipping (same optimizer values as schedule_1x.py)
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy: 'step' policy with steps at 27 and 33, linear warmup, 36 epochs in total
+lr_config = dict(policy='step',
+                 warmup='linear',
+                 warmup_iters=1000,
+                 warmup_ratio=0.001,
+                 step=[27, 33])
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_predcls_psg.py b/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..1be5fdcf74eeb3e941ef2829546cfb14338face8
--- /dev/null
+++ b/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_predcls_psg.py
@@ -0,0 +1,26 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py'  # inherit from the ResNet-50 GPSNet predcls config
+
+model = dict(backbone=dict(  # override only the backbone depth and its pretrained checkpoint
+    depth=101,
+    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config: work_dir and the logger run name are both derived from expt_name below
+project_name = 'openpsg'
+expt_name = 'gpsnet_panoptic_fpn_r101_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+            ),
+        ),
+    ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_sgdet_psg.py b/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ae604515cefc1aa3849ee328c1667408f08cab4
--- /dev/null
+++ b/OpenPSG/configs/gpsnet/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
@@ -0,0 +1,26 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py'  # inherit from the ResNet-50 GPSNet sgdet config
+
+model = dict(backbone=dict(  # override only the backbone depth and its pretrained checkpoint
+    depth=101,
+    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config: work_dir and the logger run name are both derived from expt_name below
+project_name = 'openpsg'
+expt_name = 'gpsnet_panoptic_fpn_r101_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+            ),
+        ),
+    ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_predcls_psg.py b/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd06ebcd9c19aec5210937600af4db0d66d99def
--- /dev/null
+++ b/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_predcls_psg.py
@@ -0,0 +1,41 @@
+_base_ = [  # built on top of the motifs predcls config; only the settings below are overridden
+    '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
+]
+
+model = dict(relation_head=dict(
+    type='GPSHead',  # relation head type used by this config
+    head_config=dict(
+        # NOTE: Evaluation type (both flags are True in this predcls config)
+        use_gt_box=True,
+        use_gt_label=True,
+    ),
+))
+
+evaluation = dict(interval=1,
+                  metric='predcls',
+                  relation_mode=True,
+                  classwise=True,
+                  detection_method='pan_seg')
+
+# Change batch size and learning rate
+data = dict(samples_per_gpu=16, workers_per_gpu=0)
+optimizer = dict(type='SGD', lr=0.03, momentum=0.9, weight_decay=0.0001)
+
+# Log config: work_dir and the logger run name are both derived from expt_name below
+project_name = 'openpsg'
+expt_name = 'gpsnet_panoptic_fpn_r50_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+            ),
+        ),
+    ],
+)
diff --git a/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_sgdet_psg.py b/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..78165a4ce56b57819445d8d58840c6f9fca5f4a8
--- /dev/null
+++ b/OpenPSG/configs/gpsnet/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
@@ -0,0 +1,45 @@
+_base_ = [  # built on top of the motifs predcls config; only the settings below are overridden
+    '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
+]
+
+model = dict(
+    relation_head=dict(
+        type='GPSHead',
+        head_config=dict(
+            # NOTE: Evaluation type (both flags are False in this sgdet config)
+            use_gt_box=False,
+            use_gt_label=False,
+        ),
+    ),
+    roi_head=dict(bbox_head=dict(type='SceneGraphBBoxHead'), ),  # only the bbox head type is overridden
+)
+
+evaluation = dict(
+    interval=1,
+    metric='sgdet',
+    relation_mode=True,
+    classwise=True,
+    iou_thrs=0.5,
+    detection_method='pan_seg',
+)
+
+data = dict(samples_per_gpu=16)
+
+# Log config: work_dir and the logger run name are both derived from expt_name below
+project_name = 'openpsg'
+expt_name = 'gpsnet_panoptic_fpn_r50_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+            ),
+        ),
+    ],
+)
diff --git a/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_predcls_psg.py b/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..28bc0487451535069f9301853e0190fc9025bb85
--- /dev/null
+++ b/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_predcls_psg.py
@@ -0,0 +1,28 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py'  # inherit from the ResNet-50 IMP predcls config
+
+model = dict(backbone=dict(  # override only the backbone depth and its pretrained checkpoint
+    depth=101,
+    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config: work_dir and the logger run name are both derived from expt_name below
+project_name = 'openpsg'
+expt_name = 'imp_panoptic_fpn_r101_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+                # config=work_dir + "/cfg.yaml"
+            ),
+        ),
+    ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_sgdet_psg.py b/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f0f96866d423e0f6a214e98462c721626744309
--- /dev/null
+++ b/OpenPSG/configs/imp/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
@@ -0,0 +1,26 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py'  # inherit from the ResNet-50 IMP sgdet config
+
+model = dict(backbone=dict(  # override only the backbone depth and its pretrained checkpoint
+    depth=101,
+    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config: work_dir and the logger run name are both derived from expt_name below
+project_name = 'openpsg'
+expt_name = 'imp_panoptic_fpn_r101_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+            ),
+        ),
+    ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_predcls_psg.py b/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..93189cfd37a51374fe62e29b0bc8550559da3a27
--- /dev/null
+++ b/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_predcls_psg.py
@@ -0,0 +1,44 @@
+_base_ = [  # built on top of the motifs predcls config; only the settings below are overridden
+    '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
+]
+
+model = dict(relation_head=dict(
+    type='IMPHead',  # relation head type used by this config
+    head_config=dict(
+        # NOTE: Evaluation type (both flags are True in this predcls config)
+        use_gt_box=True,
+        use_gt_label=True,
+        num_iter=2,
+    ),
+))
+
+evaluation = dict(interval=1,
+                  metric='predcls',
+                  relation_mode=True,
+                  classwise=True)
+
+# Change batch size and learning rate
+data = dict(samples_per_gpu=16, )
+# FIXME: a workers_per_gpu=0 override is left commented out here; the original note asks "Is this the problem?" (unresolved)
+optimizer = dict(type='SGD', lr=0.001, momentum=0.9)
+
+# Log config: work_dir and the logger run name are both derived from expt_name below
+project_name = 'openpsg'
+expt_name = 'imp_panoptic_fpn_r50_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+                # config=work_dir + "/cfg.yaml"
+            ),
+        ),
+    ],
+)
diff --git a/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_sgdet_psg.py b/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ec83492bfccc1b706723b6de680392f9b0e2c7a
--- /dev/null
+++ b/OpenPSG/configs/imp/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
@@ -0,0 +1,48 @@
+_base_ = [  # built on top of the motifs predcls config; only the settings below are overridden
+    '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
+]
+
+model = dict(relation_head=dict(
+    type='IMPHead',
+    head_config=dict(
+        # NOTE: Evaluation type (both flags are False in this sgdet config)
+        use_gt_box=False,
+        use_gt_label=False,
+        num_iter=2,
+    ),
+))  # NOTE(review): unlike configs/gpsnet/panoptic_fpn_r50_fpn_1x_sgdet_psg.py, roi_head.bbox_head is not overridden here - confirm intended
+
+evaluation = dict(
+    interval=1,
+    metric='sgdet',
+    relation_mode=True,
+    classwise=True,
+    iou_thrs=0.5,
+    detection_method='pan_seg',
+)
+
+# Change batch size and learning rate
+data = dict(samples_per_gpu=16, )
+# FIXME: a workers_per_gpu=0 override is left commented out here; the original note asks "Is this the problem?" (unresolved)
+optimizer = dict(type='SGD', lr=0.001, momentum=0.9)
+
+# Log config: work_dir and the logger run name are both derived from expt_name below
+project_name = 'openpsg'
+expt_name = 'imp_panoptic_fpn_r50_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+                # config=work_dir + "/cfg.yaml"
+            ),
+        ),
+    ],
+)
diff --git a/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_predcls_psg.py b/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..d125d475b96e26c7862d16b5335798ee9defab44
--- /dev/null
+++ b/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_predcls_psg.py
@@ -0,0 +1,28 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py'  # inherit from the ResNet-50 motifs predcls config
+
+model = dict(backbone=dict(  # override only the backbone depth and its pretrained checkpoint
+    depth=101,
+    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config: work_dir and the logger run name are both derived from expt_name below
+project_name = 'openpsg'
+expt_name = 'motifs_panoptic_fpn_r101_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+                # config=work_dir + "/cfg.yaml"
+            ),
+        ),
+    ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_sgdet_psg.py b/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..55b1f9eadee9904706504b57f896e2e6482d6385
--- /dev/null
+++ b/OpenPSG/configs/motifs/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
@@ -0,0 +1,28 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py'  # inherit from the ResNet-50 motifs sgdet config
+
+model = dict(backbone=dict(  # override only the backbone depth and its pretrained checkpoint
+    depth=101,
+    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config: work_dir and the logger run name are both derived from expt_name below
+project_name = 'openpsg'
+expt_name = 'motifs_panoptic_fpn_r101_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+                # config=work_dir + "/cfg.yaml"
+            ),
+        ),
+    ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py b/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..72e2afc7e139a93749fcb28f8f8a7b4c3612478d
--- /dev/null
+++ b/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py
@@ -0,0 +1,241 @@
+_base_ = [  # parent configs: model, dataset, schedule and runtime
+    '../_base_/models/mask_rcnn_r50_fpn.py',
+    '../_base_/datasets/psg.py',
+    '../_base_/schedules/schedule_1x.py',
+    '../_base_/custom_runtime.py',
+]
+
+find_unused_parameters = True
+dataset_type = 'PanopticSceneGraphDataset'  # as in _base_/datasets/psg.py
+
+# HACK: class names are hard-coded here so relation_head below can reuse them.
+object_classes = [  # 133 names (80 + 53, cf. semantic_head class counts)
+    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
+    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
+    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
+    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
+    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
+    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
+    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
+    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
+    'food-other-merged', 'building-other-merged', 'rock-merged',
+    'wall-other-merged', 'rug-merged'
+]
+
+predicate_classes = [  # 56 predicate names, one per line
+    'over',
+    'in front of',
+    'beside',
+    'on',
+    'in',
+    'attached to',
+    'hanging from',
+    'on back of',
+    'falling off',
+    'going down',
+    'painted on',
+    'walking on',
+    'running on',
+    'crossing',
+    'standing on',
+    'lying on',
+    'sitting on',
+    'flying over',
+    'jumping over',
+    'jumping from',
+    'wearing',
+    'holding',
+    'carrying',
+    'looking at',
+    'guiding',
+    'kissing',
+    'eating',
+    'drinking',
+    'feeding',
+    'biting',
+    'catching',
+    'picking',
+    'playing with',
+    'chasing',
+    'climbing',
+    'cleaning',
+    'playing',
+    'touching',
+    'pushing',
+    'pulling',
+    'opening',
+    'cooking',
+    'talking to',
+    'throwing',
+    'slicing',
+    'driving',
+    'riding',
+    'parked on',
+    'driving on',
+    'about to hit',
+    'kicking',
+    'swinging',
+    'entering',
+    'exiting',
+    'enclosing',
+    'leaning on',
+]
+
+model = dict(  # SceneGraphPanopticFPN with a MotifHead relation head
+    type='SceneGraphPanopticFPN',
+    semantic_head=dict(
+        type='PanopticFPNHead',
+        num_things_classes=80,  # 80 + 53 = 133 = len(object_classes)
+        num_stuff_classes=53,
+        in_channels=256,
+        inner_channels=128,
+        start_level=0,
+        end_level=4,
+        norm_cfg=dict(type='GN', num_groups=32, requires_grad=True),
+        conv_cfg=None,
+        loss_seg=dict(type='CrossEntropyLoss',
+                      ignore_index=255,
+                      loss_weight=0.5),
+    ),
+    panoptic_fusion_head=dict(type='HeuristicFusionHead',
+                              num_things_classes=80,
+                              num_stuff_classes=53),
+    test_cfg=dict(panoptic=dict(
+        score_thr=0.6,
+        max_per_img=100,
+        mask_thr_binary=0.5,
+        mask_overlap=0.5,
+        nms=dict(type='nms', iou_threshold=0.5, class_agnostic=True),
+        stuff_area_limit=4096,
+    )),
+    relation_head=dict(
+        type='MotifHead',
+        object_classes=object_classes,
+        predicate_classes=predicate_classes,
+        num_classes=len(object_classes) + 1,  # with background class
+        num_predicates=len(predicate_classes) + 1,  # presumably same +1
+        use_bias=False,  # NOTE: whether to use frequency bias
+        head_config=dict(
+            # NOTE: Evaluation type (GT boxes and labels are used for predcls)
+            use_gt_box=True,
+            use_gt_label=True,
+            use_vision=True,
+            embed_dim=200,
+            hidden_dim=512,
+            roi_dim=1024,
+            context_pooling_dim=4096,
+            dropout_rate=0.2,
+            context_object_layer=1,
+            context_edge_layer=1,
+            glove_dir='data/glove/',
+            causal_effect_analysis=False,
+        ),
+        bbox_roi_extractor=dict(
+            type='VisualSpatialExtractor',
+            bbox_roi_layer=dict(type='RoIAlign',
+                                output_size=7,
+                                sampling_ratio=2),
+            with_visual_bbox=True,
+            with_visual_mask=False,
+            with_visual_point=False,
+            with_spatial=False,
+            in_channels=256,
+            fc_out_channels=1024,
+            featmap_strides=[4, 8, 16, 32],
+        ),
+        relation_roi_extractor=dict(
+            type='VisualSpatialExtractor',
+            bbox_roi_layer=dict(type='RoIAlign',
+                                output_size=7,
+                                sampling_ratio=2),
+            with_visual_bbox=True,
+            with_visual_mask=False,
+            with_visual_point=False,
+            with_spatial=True,
+            separate_spatial=False,
+            in_channels=256,
+            fc_out_channels=1024,
+            featmap_strides=[4, 8, 16, 32],
+        ),
+        relation_sampler=dict(
+            type='Motif',
+            pos_iou_thr=0.5,
+            require_overlap=False,  # not required for sgdet training
+            num_sample_per_gt_rel=4,
+            num_rel_per_image=1024,
+            pos_fraction=0.25,
+            # NOTE: To only include overlapping bboxes?
+            test_overlap=False,  # for testing
+        ),
+        loss_object=dict(type='CrossEntropyLoss',
+                         use_sigmoid=False,
+                         loss_weight=1.0),
+        loss_relation=dict(type='CrossEntropyLoss',
+                           use_sigmoid=False,
+                           loss_weight=1.0),
+    ),
+)
+
+custom_hooks = []  # replaces the base list from custom_runtime.py
+
+# Modules to freeze; relation_head is not listed
+freeze_modules = [
+    'backbone',
+    'neck',
+    'rpn_head',
+    'roi_head',
+    'semantic_head',
+    'panoptic_fusion_head',
+]
+
+evaluation = dict(interval=1,
+                  metric='predcls',
+                  relation_mode=True,
+                  classwise=True)
+
+# Change batch size and learning rate
+data = dict(samples_per_gpu=16, )
+# optimizer = dict(lr=0.003)
+optimizer = dict(type='SGD', lr=0.03, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(_delete_=True,  # replace, not merge, the base value
+                        grad_clip=dict(max_norm=35, norm_type=2))
+
+lr_config = dict(policy='step',
+                 warmup='linear',
+                 warmup_iters=500,
+                 warmup_ratio=1.0 / 3,
+                 step=[7, 10])
+
+# Log config: names used for the work dir and the W&B run
+project_name = 'openpsg'
+expt_name = 'motifs_panoptic_fpn_r50_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(  # text log plus a W&B run named after the experiment
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+            ),
+        ),
+    ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r50_fpn_1x_coco_20210821_101153-9668fd13.pth'
diff --git a/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_sgdet_psg.py b/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..55586140a9723c83b0b347bbfde042822ae8618b
--- /dev/null
+++ b/OpenPSG/configs/motifs/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
@@ -0,0 +1,44 @@
+_base_ = [  # parent: the R50 predcls config
+    './panoptic_fpn_r50_fpn_1x_predcls_psg.py',
+]
+
+model = dict(
+    relation_head=dict(
+        head_config=dict(
+            # NOTE: Evaluation type (sgdet: no GT boxes or labels are used)
+            use_gt_box=False,
+            use_gt_label=False,
+        ), ),
+    roi_head=dict(bbox_head=dict(type='SceneGraphBBoxHead'), ),
+)
+
+evaluation = dict(interval=1,
+                  metric='sgdet',
+                  relation_mode=True,
+                  classwise=True,
+                  iou_thrs=0.5,
+                  detection_method='pan_seg')
+
+# Change batch size (no optimizer / learning-rate override in this file)
+data = dict(samples_per_gpu=8,
+            # workers_per_gpu=2
+            )
+
+# Log config: names used for the work dir and the W&B run
+project_name = 'openpsg'
+expt_name = 'motifs_panoptic_fpn_r50_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(  # text log plus a W&B run named after the experiment
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+            ),
+        ),
+    ],
+)
diff --git a/OpenPSG/configs/psgformer/psgformer_r101_psg.py b/OpenPSG/configs/psgformer/psgformer_r101_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..7055248f2307ca9b32f7efe3c6a65f118019a0c7
--- /dev/null
+++ b/OpenPSG/configs/psgformer/psgformer_r101_psg.py
@@ -0,0 +1,16 @@
+_base_ = './psgformer_r50_psg.py'  # parent: R50 PSGFormer config
+
+model = dict(backbone=dict(  # only the backbone settings are overridden
+    depth=101,
+    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# learning policy: step at 48 (the parent uses 40); 60 epochs in total
+lr_config = dict(policy='step', step=48)
+runner = dict(type='EpochBasedRunner', max_epochs=60)
+
+project_name = 'psgformer'  # NOTE(review): unused here; log_config is inherited
+expt_name = 'psgformer_r101_psg'
+work_dir = f'./work_dirs/{expt_name}'
+checkpoint_config = dict(interval=12, max_keep_ckpts=10)
+
+load_from = './work_dirs/checkpoints/detr4psgformer_r101.pth'
diff --git a/OpenPSG/configs/psgformer/psgformer_r50.py b/OpenPSG/configs/psgformer/psgformer_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..31f77e61bf46c57f8b064ca94d6a5d35b8008411
--- /dev/null
+++ b/OpenPSG/configs/psgformer/psgformer_r50.py
@@ -0,0 +1,96 @@
+model = dict(  # PSGTr framework with a PSGFormerHead
+    type='PSGTr',
+    backbone=dict(type='ResNet',
+                  depth=50,
+                  num_stages=4,
+                  out_indices=(0, 1, 2, 3),
+                  frozen_stages=1,
+                  norm_cfg=dict(type='BN', requires_grad=False),
+                  norm_eval=True,
+                  style='pytorch',
+                  init_cfg=dict(type='Pretrained',
+                                checkpoint='torchvision://resnet50')),
+    bbox_head=dict(
+        type='PSGFormerHead',
+        num_classes=80,  # overridden in psgformer_r50_psg.py
+        num_relations=117,  # overridden in psgformer_r50_psg.py
+        in_channels=2048,
+        transformer=dict(
+            type='DualTransformer',  # one encoder, two decoders
+            encoder=dict(type='DetrTransformerEncoder',
+                         num_layers=6,
+                         transformerlayers=dict(
+                             type='BaseTransformerLayer',
+                             attn_cfgs=[
+                                 dict(type='MultiheadAttention',
+                                      embed_dims=256,
+                                      num_heads=8,
+                                      dropout=0.1)
+                             ],
+                             feedforward_channels=2048,
+                             ffn_dropout=0.1,
+                             operation_order=('self_attn', 'norm', 'ffn',
+                                              'norm'))),
+            decoder1=dict(type='DetrTransformerDecoder',
+                          return_intermediate=True,
+                          num_layers=6,
+                          transformerlayers=dict(
+                              type='DetrTransformerDecoderLayer',
+                              attn_cfgs=dict(type='MultiheadAttention',
+                                             embed_dims=256,
+                                             num_heads=8,
+                                             dropout=0.1),
+                              feedforward_channels=2048,
+                              ffn_dropout=0.1,
+                              operation_order=('self_attn', 'norm',
+                                               'cross_attn', 'norm', 'ffn',
+                                               'norm'))),
+            decoder2=dict(type='DetrTransformerDecoder',  # same as decoder1
+                          return_intermediate=True,
+                          num_layers=6,
+                          transformerlayers=dict(
+                              type='DetrTransformerDecoderLayer',
+                              attn_cfgs=dict(type='MultiheadAttention',
+                                             embed_dims=256,
+                                             num_heads=8,
+                                             dropout=0.1),
+                              feedforward_channels=2048,
+                              ffn_dropout=0.1,
+                              operation_order=('self_attn', 'norm',
+                                               'cross_attn', 'norm', 'ffn',
+                                               'norm'))),
+        ),
+        positional_encoding=dict(type='SinePositionalEncoding',
+                                 num_feats=128,
+                                 normalize=True),
+        rel_loss_cls=dict(type='CrossEntropyLoss',
+                          use_sigmoid=False,
+                          loss_weight=2.0,
+                          class_weight=1.0),
+        sub_id_loss=dict(type='MultilabelCrossEntropy', loss_weight=2.0),
+        obj_id_loss=dict(type='MultilabelCrossEntropy', loss_weight=2.0),
+        loss_cls=dict(type='CrossEntropyLoss',
+                      use_sigmoid=False,
+                      loss_weight=4.0,
+                      class_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=3.0),
+        loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+        focal_loss=dict(type='BCEFocalLoss', loss_weight=1.0),
+        dice_loss=dict(type='psgtrDiceLoss', loss_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(id_assigner=dict(type='IdMatcher',
+                                    sub_id_cost=dict(type='ClassificationCost',
+                                                     weight=1.),
+                                    obj_id_cost=dict(type='ClassificationCost',
+                                                     weight=1.),
+                                    r_cls_cost=dict(type='ClassificationCost',
+                                                    weight=1.)),
+                   # cost weights below equal the loss_cls/bbox/iou weights
+                   bbox_assigner=dict(type='HungarianAssigner',
+                                      cls_cost=dict(type='ClassificationCost',
+                                                    weight=4.0),
+                                      reg_cost=dict(type='BBoxL1Cost',
+                                                    weight=3.0),
+                                      iou_cost=dict(type='IoUCost',
+                                                    iou_mode='giou',
+                                                    weight=2.0))),
+    test_cfg=dict(max_per_img=100))
diff --git a/OpenPSG/configs/psgformer/psgformer_r50_psg.py b/OpenPSG/configs/psgformer/psgformer_r50_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..6452d39335427fe40de8c8a869dedeb5992da2f9
--- /dev/null
+++ b/OpenPSG/configs/psgformer/psgformer_r50_psg.py
@@ -0,0 +1,244 @@
+_base_ = [  # parent configs: model, dataset and runtime
+    './psgformer_r50.py', '../_base_/datasets/psg.py',
+    '../_base_/custom_runtime.py'
+]
+
+find_unused_parameters = True
+
+custom_imports = dict(imports=[  # openpsg modules listed for import
+    'openpsg.models.frameworks.psgtr', 'openpsg.models.losses.seg_losses',
+    'openpsg.models.frameworks.dual_transformer',
+    'openpsg.models.relation_heads.psgformer_head', 'openpsg.datasets',
+    'openpsg.datasets.pipelines.loading',
+    'openpsg.datasets.pipelines.rel_randomcrop',
+    'openpsg.models.relation_heads.approaches.matcher', 'openpsg.utils'
+],
+                      allow_failed_imports=False)
+
+dataset_type = 'PanopticSceneGraphDataset'  # as in _base_/datasets/psg.py
+
+# HACK: class names are hard-coded here so bbox_head below can reuse them.
+object_classes = [  # 133 names
+    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
+    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
+    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
+    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
+    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
+    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
+    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
+    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
+    'food-other-merged', 'building-other-merged', 'rock-merged',
+    'wall-other-merged', 'rug-merged'
+]
+
+predicate_classes = [  # 56 predicate names, one per line
+    'over',
+    'in front of',
+    'beside',
+    'on',
+    'in',
+    'attached to',
+    'hanging from',
+    'on back of',
+    'falling off',
+    'going down',
+    'painted on',
+    'walking on',
+    'running on',
+    'crossing',
+    'standing on',
+    'lying on',
+    'sitting on',
+    'flying over',
+    'jumping over',
+    'jumping from',
+    'wearing',
+    'holding',
+    'carrying',
+    'looking at',
+    'guiding',
+    'kissing',
+    'eating',
+    'drinking',
+    'feeding',
+    'biting',
+    'catching',
+    'picking',
+    'playing with',
+    'chasing',
+    'climbing',
+    'cleaning',
+    'playing',
+    'touching',
+    'pushing',
+    'pulling',
+    'opening',
+    'cooking',
+    'talking to',
+    'throwing',
+    'slicing',
+    'driving',
+    'riding',
+    'parked on',
+    'driving on',
+    'about to hit',
+    'kicking',
+    'swinging',
+    'entering',
+    'exiting',
+    'enclosing',
+    'leaning on',
+]
+
+model = dict(bbox_head=dict(  # overrides placeholders from psgformer_r50.py
+    num_classes=len(object_classes),  # 133
+    num_relations=len(predicate_classes),  # 56
+    object_classes=object_classes,
+    predicate_classes=predicate_classes,
+    num_obj_query=100,
+    num_rel_query=100,
+), )
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],  # as in datasets/psg.py
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True)
+# train_pipeline. NOTE: the img_scale and the Pad's size_divisor differ
+# from the default setting in mmdet.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadPanopticSceneGraphAnnotations',
+         with_bbox=True,
+         with_rel=True,
+         with_mask=True,
+         with_seg=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='AutoAugment',
+        policies=[
+            [  # policy 1: a single multi-scale Resize
+                dict(type='Resize',
+                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                (576, 1333), (608, 1333), (640, 1333),
+                                (672, 1333), (704, 1333), (736, 1333),
+                                (768, 1333), (800, 1333)],
+                     multiscale_mode='value',
+                     keep_ratio=True)
+            ],
+            [  # policy 2: Resize, RelRandomCrop, then Resize again
+                dict(type='Resize',
+                     img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+                     multiscale_mode='value',
+                     keep_ratio=True),
+                dict(type='RelRandomCrop',
+                     crop_type='absolute_range',
+                     crop_size=(384, 600),
+                     allow_negative_crop=False),  # no empty relations
+                dict(type='Resize',
+                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                (576, 1333), (608, 1333), (640, 1333),
+                                (672, 1333), (704, 1333), (736, 1333),
+                                (768, 1333), (800, 1333)],
+                     multiscale_mode='value',
+                     override=True,
+                     keep_ratio=True)
+            ]
+        ]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=1),
+    dict(type='RelsFormatBundle'),
+    dict(type='Collect',
+         keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_masks'])
+]
+# test_pipeline. NOTE: Pad uses size_divisor=1 rather than the default
+# setting (size_divisor=32); either choice has little effect on performance.
+# Ground-truth boxes/labels are loaded below, but only 'img' is collected.
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
+    dict(type='MultiScaleFlipAug',
+         img_scale=(1333, 800),
+         flip=False,
+         transforms=[
+             dict(type='Resize', keep_ratio=True),
+             dict(type='RandomFlip'),
+             dict(type='Normalize', **img_norm_cfg),
+             dict(type='Pad', size_divisor=1),
+             dict(type='ImageToTensor', keys=['img']),
+             dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
+             dict(type='ToDataContainer',
+                  fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))),
+             dict(type='Collect', keys=['img']),
+         ])
+]
+
+evaluation = dict(  # sgdet metric, evaluated with detection_method 'pan_seg'
+    interval=1,
+    metric='sgdet',
+    relation_mode=True,
+    classwise=True,
+    iou_thrs=0.5,
+    detection_method='pan_seg',
+)
+
+data = dict(samples_per_gpu=1,
+            workers_per_gpu=2,
+            train=dict(pipeline=train_pipeline),
+            val=dict(pipeline=test_pipeline),  # val and test share a pipeline
+            test=dict(pipeline=test_pipeline))
+# optimizer: AdamW; every key listed below gets lr_mult=0.1
+optimizer = dict(
+    type='AdamW',
+    lr=0.001,
+    weight_decay=0.0001,
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+            'transformer.encoder': dict(lr_mult=0.1, decay_mult=1.0),
+            'transformer.decoder1': dict(lr_mult=0.1, decay_mult=1.0),
+            'obj_query_embed': dict(lr_mult=0.1, decay_mult=1.0),
+            'input_proj': dict(lr_mult=0.1, decay_mult=1.0),
+            'class_embed': dict(lr_mult=0.1, decay_mult=1.0),
+            'box_embed': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_attention': dict(lr_mult=0.1, decay_mult=1.0),
+            'mask_head': dict(lr_mult=0.1, decay_mult=1.0),
+        }))
+
+optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
+
+# learning policy: step at 40, 60 epochs in total
+lr_config = dict(policy='step', step=40)
+runner = dict(type='EpochBasedRunner', max_epochs=60)
+
+project_name = 'psgformer'
+expt_name = 'psgformer_r50_psg'
+work_dir = f'./work_dirs/{expt_name}'  # one sub-directory per experiment
+checkpoint_config = dict(interval=1, max_keep_ckpts=15)
+
+log_config = dict(  # text log plus a W&B run named after the experiment
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+            ),
+        )
+    ],
+)
+
+load_from = './work_dirs/checkpoints/detr4psgformer_r50.pth'
diff --git a/OpenPSG/configs/psgformer/psgformer_r50_psg_inference.py b/OpenPSG/configs/psgformer/psgformer_r50_psg_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..37bebaf42627dc17503986567b18fc6a9770f427
--- /dev/null
+++ b/OpenPSG/configs/psgformer/psgformer_r50_psg_inference.py
@@ -0,0 +1,31 @@
+_base_ = [  # parent: the R50 PSGFormer training config
+    './psgformer_r50_psg.py'
+]
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],  # same as the parent
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True)
+pipeline = [  # image-only pipeline: no annotation loading step
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),  # parent test_pipeline uses 1
+            # NOTE: Do not wrap the img in a DataContainer (DC).
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+
+        ],
+    ),
+]
+
+data = dict(  # sets the test split's pipeline to the list defined above
+    test=dict(
+        pipeline=pipeline,
+    ),
+)
diff --git a/OpenPSG/configs/psgtr/psgtr_r101_psg.py b/OpenPSG/configs/psgtr/psgtr_r101_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..916dc05998c72b83fb5c3221be10af3f5a7f7827
--- /dev/null
+++ b/OpenPSG/configs/psgtr/psgtr_r101_psg.py
@@ -0,0 +1,231 @@
+_base_ = [  # parent configs: model, dataset and runtime
+    '../_base_/models/psgtr_r101.py', '../_base_/datasets/psg.py',
+    '../_base_/custom_runtime.py'
+]
+
+custom_imports = dict(imports=[  # openpsg modules listed for import
+    'openpsg.models.frameworks.psgtr', 'openpsg.models.losses.seg_losses',
+    'openpsg.models.relation_heads.psgtr_head', 'openpsg.datasets',
+    'openpsg.datasets.pipelines.loading',
+    'openpsg.datasets.pipelines.rel_randomcrop',
+    'openpsg.models.relation_heads.approaches.matcher', 'openpsg.utils'
+],
+                      allow_failed_imports=False)
+
+dataset_type = 'PanopticSceneGraphDataset'  # as in _base_/datasets/psg.py
+
+# HACK: class names are hard-coded here so bbox_head below can reuse them.
+object_classes = [  # 133 names
+    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
+    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
+    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
+    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
+    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
+    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
+    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
+    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
+    'food-other-merged', 'building-other-merged', 'rock-merged',
+    'wall-other-merged', 'rug-merged'
+]
+
+predicate_classes = [  # 56 predicate names, one per line
+    'over',
+    'in front of',
+    'beside',
+    'on',
+    'in',
+    'attached to',
+    'hanging from',
+    'on back of',
+    'falling off',
+    'going down',
+    'painted on',
+    'walking on',
+    'running on',
+    'crossing',
+    'standing on',
+    'lying on',
+    'sitting on',
+    'flying over',
+    'jumping over',
+    'jumping from',
+    'wearing',
+    'holding',
+    'carrying',
+    'looking at',
+    'guiding',
+    'kissing',
+    'eating',
+    'drinking',
+    'feeding',
+    'biting',
+    'catching',
+    'picking',
+    'playing with',
+    'chasing',
+    'climbing',
+    'cleaning',
+    'playing',
+    'touching',
+    'pushing',
+    'pulling',
+    'opening',
+    'cooking',
+    'talking to',
+    'throwing',
+    'slicing',
+    'driving',
+    'riding',
+    'parked on',
+    'driving on',
+    'about to hit',
+    'kicking',
+    'swinging',
+    'entering',
+    'exiting',
+    'enclosing',
+    'leaning on',
+]
+
+model = dict(bbox_head=dict(  # sizes the head from the class lists above
+    num_classes=len(object_classes),  # 133
+    num_relations=len(predicate_classes),  # 56
+    object_classes=object_classes,
+    predicate_classes=predicate_classes,
+    use_mask=True,
+    num_query=100,
+), )
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],  # as in datasets/psg.py
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True)
+# train_pipeline. NOTE: the img_scale and the Pad's size_divisor differ
+# from the default setting in mmdet.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadPanopticSceneGraphAnnotations',
+         with_bbox=True,
+         with_rel=True,
+         with_mask=True,
+         with_seg=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='AutoAugment',
+        policies=[
+            [  # policy 1: a single multi-scale Resize
+                dict(type='Resize',
+                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                (576, 1333), (608, 1333), (640, 1333),
+                                (672, 1333), (704, 1333), (736, 1333),
+                                (768, 1333), (800, 1333)],
+                     multiscale_mode='value',
+                     keep_ratio=True)
+            ],
+            [  # policy 2: Resize, RelRandomCrop, then Resize again
+                dict(type='Resize',
+                     img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+                     multiscale_mode='value',
+                     keep_ratio=True),
+                dict(type='RelRandomCrop',
+                     crop_type='absolute_range',
+                     crop_size=(384, 600),
+                     allow_negative_crop=False),  # no empty relations
+                dict(type='Resize',
+                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                (576, 1333), (608, 1333), (640, 1333),
+                                (672, 1333), (704, 1333), (736, 1333),
+                                (768, 1333), (800, 1333)],
+                     multiscale_mode='value',
+                     override=True,
+                     keep_ratio=True)
+            ]
+        ]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=1),
+    dict(type='RelsFormatBundle'),
+    dict(type='Collect',
+         keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_masks'])
+]
+# test_pipeline. NOTE: Pad uses size_divisor=1 rather than the default
+# setting (size_divisor=32); there is little effect on the performance
+# whether we use the default setting or size_divisor=1.
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    # dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=1),
+            dict(type='ImageToTensor', keys=['img']),
+            # dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
+            # dict(type='ToDataContainer', fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+evaluation = dict(  # sgdet metric, evaluated with detection_method 'pan_seg'
+    interval=1,
+    metric='sgdet',
+    relation_mode=True,
+    classwise=True,
+    iou_thrs=0.5,
+    detection_method='pan_seg',
+)
+
+data = dict(samples_per_gpu=1,
+            workers_per_gpu=2,
+            train=dict(pipeline=train_pipeline),
+            val=dict(pipeline=test_pipeline),  # val and test share a pipeline
+            test=dict(pipeline=test_pipeline))
+# optimizer: AdamW with lr_mult=0.1 on the backbone
+optimizer = dict(
+    type='AdamW',
+    lr=0.0001,
+    weight_decay=0.0001,
+    paramwise_cfg=dict(custom_keys={
+        'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    }))
+optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
+
+# learning policy: step at 40, 60 epochs in total
+lr_config = dict(policy='step', step=40)
+runner = dict(type='EpochBasedRunner', max_epochs=60)
+
+project_name = 'psgtr'
+expt_name = 'psgtr_r101_psg'
+work_dir = f'./work_dirs/{expt_name}'  # one sub-directory per experiment
+checkpoint_config = dict(interval=2, max_keep_ckpts=10)
+
+log_config = dict(  # text log plus a W&B run named after the experiment
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+            ),
+        )
+    ],
+)
+
+load_from = 'work_dirs/checkpoints/detr_pan_r101.pth'
diff --git a/OpenPSG/configs/psgtr/psgtr_r50.py b/OpenPSG/configs/psgtr/psgtr_r50.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8827bbb9461a34a9d894c2aee9fb6286503898d
--- /dev/null
+++ b/OpenPSG/configs/psgtr/psgtr_r50.py
@@ -0,0 +1,82 @@
+model = dict(
+    type='PSGTr',
+    backbone=dict(type='ResNet',
+                  depth=50,
+                  num_stages=4,
+                  out_indices=(0, 1, 2, 3),
+                  frozen_stages=1,
+                  norm_cfg=dict(type='BN', requires_grad=False),
+                  norm_eval=True,
+                  style='pytorch',
+                  init_cfg=dict(type='Pretrained',
+                                checkpoint='torchvision://resnet50')),
+    bbox_head=dict(type='PSGTrHead',
+                   num_classes=80,
+                   num_relations=117,
+                   in_channels=2048,
+                   transformer=dict(
+                       type='Transformer',
+                       encoder=dict(type='DetrTransformerEncoder',
+                                    num_layers=6,
+                                    transformerlayers=dict(
+                                        type='BaseTransformerLayer',
+                                        attn_cfgs=[
+                                            dict(type='MultiheadAttention',
+                                                 embed_dims=256,
+                                                 num_heads=8,
+                                                 dropout=0.1)
+                                        ],
+                                        feedforward_channels=2048,
+                                        ffn_dropout=0.1,
+                                        operation_order=('self_attn', 'norm',
+                                                         'ffn', 'norm'))),
+                       decoder=dict(
+                           type='DetrTransformerDecoder',
+                           return_intermediate=True,
+                           num_layers=6,
+                           transformerlayers=dict(
+                               type='DetrTransformerDecoderLayer',
+                               attn_cfgs=dict(type='MultiheadAttention',
+                                              embed_dims=256,
+                                              num_heads=8,
+                                              dropout=0.1),
+                               feedforward_channels=2048,
+                               ffn_dropout=0.1,
+                               operation_order=('self_attn', 'norm',
+                                                'cross_attn', 'norm', 'ffn',
+                                                'norm')),
+                       )),
+                   positional_encoding=dict(type='SinePositionalEncoding',
+                                            num_feats=128,
+                                            normalize=True),
+                   sub_loss_cls=dict(type='CrossEntropyLoss',
+                                     use_sigmoid=False,
+                                     loss_weight=1.0,
+                                     class_weight=1.0),
+                   sub_loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+                   sub_loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+                   sub_focal_loss=dict(type='BCEFocalLoss', loss_weight=2.0),
+                   sub_dice_loss=dict(type='psgtrDiceLoss', loss_weight=2.0),
+                   obj_loss_cls=dict(type='CrossEntropyLoss',
+                                     use_sigmoid=False,
+                                     loss_weight=1.0,
+                                     class_weight=1.0),
+                   obj_loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+                   obj_loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+                   obj_focal_loss=dict(type='BCEFocalLoss', loss_weight=2.0),
+                   obj_dice_loss=dict(type='psgtrDiceLoss', loss_weight=2.0),
+                   rel_loss_cls=dict(type='CrossEntropyLoss',
+                                     use_sigmoid=False,
+                                     loss_weight=2.0,
+                                     class_weight=1.0)),
+    # training and testing settings
+    train_cfg=dict(assigner=dict(
+        type='HTriMatcher',
+        s_cls_cost=dict(type='ClassificationCost', weight=1.),
+        s_reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+        s_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+        o_cls_cost=dict(type='ClassificationCost', weight=1.),
+        o_reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+        o_iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
+        r_cls_cost=dict(type='ClassificationCost', weight=2.))),
+    test_cfg=dict(max_per_img=100))
diff --git a/OpenPSG/configs/psgtr/psgtr_r50_psg.py b/OpenPSG/configs/psgtr/psgtr_r50_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..6440149836d4eadd912b5c00412e247ee4637e68
--- /dev/null
+++ b/OpenPSG/configs/psgtr/psgtr_r50_psg.py
@@ -0,0 +1,233 @@
+_base_ = [
+    '../_base_/models/psgtr_r50.py', '../_base_/datasets/psg.py',
+    '../_base_/custom_runtime.py'
+]
+
+custom_imports = dict(imports=[
+    'openpsg.models.frameworks.psgtr', 'openpsg.models.losses.seg_losses',
+    'openpsg.models.relation_heads.psgtr_head', 'openpsg.datasets',
+    'openpsg.datasets.pipelines.loading',
+    'openpsg.datasets.pipelines.rel_randomcrop',
+    'openpsg.models.relation_heads.approaches.matcher', 'openpsg.utils'
+],
+                      allow_failed_imports=False)
+
+dataset_type = 'PanopticSceneGraphDataset'
+
+# HACK:
+object_classes = [
+    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
+    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
+    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
+    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
+    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
+    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
+    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
+    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
+    'food-other-merged', 'building-other-merged', 'rock-merged',
+    'wall-other-merged', 'rug-merged'
+]
+
+predicate_classes = [
+    'over',
+    'in front of',
+    'beside',
+    'on',
+    'in',
+    'attached to',
+    'hanging from',
+    'on back of',
+    'falling off',
+    'going down',
+    'painted on',
+    'walking on',
+    'running on',
+    'crossing',
+    'standing on',
+    'lying on',
+    'sitting on',
+    'flying over',
+    'jumping over',
+    'jumping from',
+    'wearing',
+    'holding',
+    'carrying',
+    'looking at',
+    'guiding',
+    'kissing',
+    'eating',
+    'drinking',
+    'feeding',
+    'biting',
+    'catching',
+    'picking',
+    'playing with',
+    'chasing',
+    'climbing',
+    'cleaning',
+    'playing',
+    'touching',
+    'pushing',
+    'pulling',
+    'opening',
+    'cooking',
+    'talking to',
+    'throwing',
+    'slicing',
+    'driving',
+    'riding',
+    'parked on',
+    'driving on',
+    'about to hit',
+    'kicking',
+    'swinging',
+    'entering',
+    'exiting',
+    'enclosing',
+    'leaning on',
+]
+
+model = dict(bbox_head=dict(
+    num_classes=len(object_classes),
+    num_relations=len(predicate_classes),
+    object_classes=object_classes,
+    predicate_classes=predicate_classes,
+    use_mask=True,
+    num_query=100,
+), )
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True)
+# train_pipeline, NOTE the img_scale and the Pad's size_divisor is different
+# from the default setting in mmdet.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadPanopticSceneGraphAnnotations',
+         with_bbox=True,
+         with_rel=True,
+         with_mask=True,
+         with_seg=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(
+        type='AutoAugment',
+        policies=[
+            [
+                dict(type='Resize',
+                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                (576, 1333), (608, 1333), (640, 1333),
+                                (672, 1333), (704, 1333), (736, 1333),
+                                (768, 1333), (800, 1333)],
+                     multiscale_mode='value',
+                     keep_ratio=True)
+            ],
+            [
+                dict(type='Resize',
+                     img_scale=[(400, 1333), (500, 1333), (600, 1333)],
+                     multiscale_mode='value',
+                     keep_ratio=True),
+                dict(type='RelRandomCrop',
+                     crop_type='absolute_range',
+                     crop_size=(384, 600),
+                     allow_negative_crop=False),  # no empty relations
+                dict(type='Resize',
+                     img_scale=[(480, 1333), (512, 1333), (544, 1333),
+                                (576, 1333), (608, 1333), (640, 1333),
+                                (672, 1333), (704, 1333), (736, 1333),
+                                (768, 1333), (800, 1333)],
+                     multiscale_mode='value',
+                     override=True,
+                     keep_ratio=True)
+            ]
+        ]),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=1),
+    dict(type='RelsFormatBundle'),
+    dict(type='Collect',
+         keys=['img', 'gt_bboxes', 'gt_labels', 'gt_rels', 'gt_masks'])
+]
+# test_pipeline, NOTE the Pad's size_divisor is different from the default
+# setting (size_divisor=32). Using the default setting or size_divisor=1
+# has little effect on the performance.
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    # dict(type='LoadSceneGraphAnnotations', with_bbox=True, with_rel=True),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=1),
+            dict(type='ImageToTensor', keys=['img']),
+            # dict(type='ToTensor', keys=['gt_bboxes', 'gt_labels']),
+            # dict(type='ToDataContainer', fields=(dict(key='gt_bboxes'), dict(key='gt_labels'))),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+
+evaluation = dict(
+    interval=1,
+    metric='sgdet',
+    relation_mode=True,
+    classwise=True,
+    iou_thrs=0.5,
+    detection_method='pan_seg',
+)
+
+data = dict(samples_per_gpu=1,
+            workers_per_gpu=2,
+            train=dict(pipeline=train_pipeline),
+            val=dict(pipeline=test_pipeline),
+            test=dict(pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='AdamW',
+    lr=0.0001,
+    weight_decay=0.0001,
+    paramwise_cfg=dict(custom_keys={
+        'backbone': dict(lr_mult=0.1, decay_mult=1.0),
+    }))
+optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
+
+# learning policy
+lr_config = dict(policy='step', step=40)
+runner = dict(type='EpochBasedRunner', max_epochs=60)
+
+project_name = 'psgformer'
+expt_name = 'psgtr_r50_psg_0.5_scale_mask'
+work_dir = f'./work_dirs/{expt_name}'
+checkpoint_config = dict(interval=2, max_keep_ckpts=10)
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook'),
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+                # config=work_dir + "/cfg.yaml"
+            ),
+        )
+    ],
+)
+
+load_from = 'work_dirs/checkpoints/detr_pan_r50.pth'
diff --git a/OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py b/OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d32a233c2690c53b40a60a69d10b6fa58d0ea7f
--- /dev/null
+++ b/OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py
@@ -0,0 +1,31 @@
+_base_ = [
+    './psgtr_r50_psg.py'
+]
+
+img_norm_cfg = dict(mean=[123.675, 116.28, 103.53],
+                    std=[58.395, 57.12, 57.375],
+                    to_rgb=True)
+pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            # NOTE: Do not change the img to DC.
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+
+        ],
+    ),
+]
+
+data = dict(
+    test=dict(
+        pipeline=pipeline,
+    ),
+)
\ No newline at end of file
diff --git a/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_predcls_psg.py b/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..faabe0d659a7e1b24b2f58dda644a9a0fe8faf08
--- /dev/null
+++ b/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_predcls_psg.py
@@ -0,0 +1,28 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_predcls_psg.py'
+
+model = dict(backbone=dict(
+    depth=101,
+    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'vctree_panoptic_fpn_r101_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+                # config=work_dir + "/cfg.yaml"
+            ),
+        ),
+    ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_sgdet_psg.py b/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc49e1368baa36b8fcc2c14a3fb7703e51c854f2
--- /dev/null
+++ b/OpenPSG/configs/vctree/panoptic_fpn_r101_fpn_1x_sgdet_psg.py
@@ -0,0 +1,28 @@
+_base_ = './panoptic_fpn_r50_fpn_1x_sgdet_psg.py'
+
+model = dict(backbone=dict(
+    depth=101,
+    init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101')))
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'vctree_panoptic_fpn_r101_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+                # config=work_dir + "/cfg.yaml"
+            ),
+        ),
+    ],
+)
+
+load_from = 'work_dirs/checkpoints/panoptic_fpn_r101_fpn_1x_coco_20210820_193950-ab9157a2.pth'
diff --git a/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_predcls_psg.py b/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_predcls_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..e78db15d48d404634713181231bb498ed27b936b
--- /dev/null
+++ b/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_predcls_psg.py
@@ -0,0 +1,43 @@
+_base_ = [
+    '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
+]
+
+model = dict(relation_head=dict(
+    type='VCTreeHead',
+    head_config=dict(
+        # NOTE: Evaluation type
+        use_gt_box=True,
+        use_gt_label=True,
+    ),
+))
+
+evaluation = dict(interval=1,
+                  metric='predcls',
+                  relation_mode=True,
+                  classwise=True)
+
+# Change batch size and learning rate
+data = dict(samples_per_gpu=16,
+            workers_per_gpu=0)  # FIXME: Is this the problem?
+# optimizer = dict(lr=0.001)
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'vctree_panoptic_fpn_r50_fpn_1x_predcls_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+                # config=work_dir + "/cfg.yaml"
+            ),
+        ),
+    ],
+)
diff --git a/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_sgdet_psg.py b/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0f05d87f47ebc28920183e317aa26d0abb15026
--- /dev/null
+++ b/OpenPSG/configs/vctree/panoptic_fpn_r50_fpn_1x_sgdet_psg.py
@@ -0,0 +1,49 @@
+_base_ = [
+    '../motifs/panoptic_fpn_r50_fpn_1x_predcls_psg.py',
+]
+
+model = dict(
+    relation_head=dict(
+        type='VCTreeHead',
+        head_config=dict(
+            # NOTE: Evaluation type
+            use_gt_box=False,
+            use_gt_label=False,
+        ),
+    ),
+    roi_head=dict(bbox_head=dict(type='SceneGraphBBoxHead'), ),
+)
+
+evaluation = dict(interval=1,
+                  metric='sgdet',
+                  relation_mode=True,
+                  classwise=True,
+                  iou_thrs=0.5,
+                  detection_method='pan_seg')
+
+# Change batch size and learning rate
+data = dict(samples_per_gpu=16,
+            # workers_per_gpu=2
+            )
+# optimizer = dict(lr=0.003)
+
+# Log config
+project_name = 'openpsg'
+expt_name = 'vctree_panoptic_fpn_r50_fpn_1x_sgdet_psg'
+work_dir = f'./work_dirs/{expt_name}'
+
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+        dict(
+            type='WandbLoggerHook',
+            init_kwargs=dict(
+                project=project_name,
+                name=expt_name,
+                # config=work_dir + "/cfg.yaml"
+            ),
+        ),
+    ],
+)
diff --git a/README.md b/README.md
index 24242ef29f59b4de0e9631b0475d05e42dc73a05..56c55de1fab90a8642f9378c9dfef888302d9530 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 ---
 title: OpenPSG
-emoji: 🐠
-colorFrom: green
-colorTo: gray
+emoji: 🖼️🏙️🌄🌉
+colorFrom: yellow
+colorTo: blue
 sdk: gradio
 sdk_version: 3.1.4
 app_file: app.py
diff --git a/app.py b/app.py
index f55e9e6c34fad83380e6d0562df75076cdd3883e..a471833d791aa71a92bad57908e9e8dc7e703dd7 100644
--- a/app.py
+++ b/app.py
@@ -1,15 +1,135 @@
-import numpy as np
+#!/usr/bin/env python
+
+from __future__ import annotations
+
+import argparse
+import os
+import pathlib
+import subprocess
+import tarfile
+
+if os.getenv('SYSTEM') == 'spaces':
+    import mim
+
+    mim.uninstall('mmcv-full', confirm_yes=True)
+    mim.install('mmcv-full==1.5.2', is_yes=True)
+
+    subprocess.call('pip uninstall -y opencv-python'.split())
+    subprocess.call('pip uninstall -y opencv-python-headless'.split())
+    subprocess.call('pip install opencv-python-headless==4.5.5.64'.split())
+
+import cv2
 import gradio as gr
+import numpy as np
+
+from mmdet.apis import init_detector, inference_detector
+from utils import show_result
+import mmcv 
+from mmcv import Config
+import os.path as osp 
+
+DESCRIPTION = '''# OpenPSG
+
+This is an official demo for [OpenPSG](https://github.com/Jingkang50/OpenPSG).
+<img id="overview" alt="overview" src="https://camo.githubusercontent.com/880346b66831a8212074787ba9a2301b4d700bd8f765ca11e4845ac0ab34c230/68747470733a2f2f6c6976652e737461746963666c69636b722e636f6d2f36353533352f35323139333837393637375f373531613465306237395f6b2e6a7067" />
+'''
+FOOTER = '<img id="visitor-badge" src="https://visitor-badge.glitch.me/badge?page_id=c-liangyu.openpsg" alt="visitor badge" />'
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options of the demo."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument('--device', type=str, default='cpu')
+    cli.add_argument('--theme', type=str)
+    cli.add_argument('--share', action='store_true')
+    cli.add_argument('--port', type=int)
+    cli.add_argument(
+        '--disable-queue', dest='enable_queue', action='store_false')
+    return cli.parse_args()
+
+
+def update_input_image(image: np.ndarray) -> dict:
+    """Shrink images whose longer side exceeds 1500 px; keep others as is."""
+    if image is not None:
+        ratio = 1500 / max(image.shape[:2])
+        if ratio < 1:
+            image = cv2.resize(image, None, fx=ratio, fy=ratio)
+    return gr.Image.update(value=image)
+
+
+def set_example_image(example: list) -> dict:
+    return gr.Image.update(value=example[0])  # first cell is the image path
+
+
+def infer(model, input_image, num_rel):
+    """Run the detector on the image and hand the output to show_result."""
+    prediction = inference_detector(model, input_image)
+    return show_result(input_image,
+                       prediction,
+                       is_one_stage=True,
+                       num_rel=num_rel,
+                       show=True)
+
+
+def main():
+    """Load the detector from the bundled checkpoint and serve the demo."""
+    args = parse_args()
+
+    model_ckt = 'OpenPSG/checkpoints/epoch_60.pth'
+    cfg = Config.fromfile('OpenPSG/configs/psgtr/psgtr_r50_psg_inference.py')
+
+    model = init_detector(cfg, model_ckt, device=args.device)
+
+    with gr.Blocks(theme=args.theme, css='style.css') as demo:
+        gr.Markdown(DESCRIPTION)
+
+        with gr.Row():
+            with gr.Column():
+                with gr.Row():
+                    input_image = gr.Image(label='Input Image', type='numpy')
+                with gr.Group():
+                    with gr.Row():
+                        num_rel = gr.Slider(
+                            5,
+                            100,
+                            step=5,
+                            value=20,
+                            label='Number of Relations')
+                with gr.Row():
+                    run_button = gr.Button(value='Run')
+            with gr.Column():
+                with gr.Row():
+                    result = gr.Gallery(label='Result', type='numpy')
+
+        with gr.Row():
+            paths = sorted(pathlib.Path('images').rglob('*.jpg'))
+            example_images = gr.Dataset(components=[input_image],
+                                        samples=[[path.as_posix()]
+                                                 for path in paths])
+
+        gr.Markdown(FOOTER)
+
+        input_image.change(fn=update_input_image,
+                           inputs=input_image,
+                           outputs=input_image)
+
+        # The model is not a Gradio component, so bind it in a closure and
+        # feed the image and the slider value as the actual event inputs.
+        run_button.click(
+            fn=lambda image, count: infer(model, image, count),
+            inputs=[input_image, num_rel],
+            outputs=result)
+
+        example_images.click(fn=set_example_image,
+                             inputs=example_images,
+                             outputs=input_image)
+
+    demo.launch(
+        enable_queue=args.enable_queue,
+        server_port=args.port,
+        share=args.share,
+    )
+
 
-def sepia(input_img):
-    sepia_filter = np.array([
-        [0.393, 0.769, 0.189], 
-        [0.349, 0.686, 0.168], 
-        [0.272, 0.534, 0.131]
-    ])
-    sepia_img = input_img.dot(sepia_filter.T)
-    sepia_img /= sepia_img.max()
-    return sepia_img
-
-demo = gr.Interface(sepia, gr.Image(shape=(200, 200)), "image")
-demo.launch(share=True)
\ No newline at end of file
+if __name__ == '__main__':
+    main()
diff --git a/fake_gan.py b/fake_gan.py
new file mode 100644
index 0000000000000000000000000000000000000000..723fa422afdd4a4323fff964b7b48d68315a76e5
--- /dev/null
+++ b/fake_gan.py
@@ -0,0 +1,56 @@
+# another demo
+# https://huggingface.co/spaces/dalle-mini/dalle-mini/blob/21944e2a8508568387951fc66a30e90f1d58819d/app/gradio/app.py
+
+# This demo needs to be run from the repo folder.
+# python fake_gan.py
+import os
+import random
+import time
+
+import gradio as gr
+
+
+def fake_gan(count, *args):
+    """Return ``count`` image URLs drawn at random from a fixed pool.
+
+    Extra positional arguments (the other UI inputs) are accepted and
+    ignored.
+    """
+    candidates = [
+        "https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=387&q=80",
+        "https://images.unsplash.com/photo-1554151228-14d9def656e4?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=386&q=80",
+        "https://images.unsplash.com/photo-1542909168-82c3e7fdca5c?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxzZWFyY2h8MXx8aHVtYW4lMjBmYWNlfGVufDB8fDB8fA%3D%3D&w=1000&q=80",
+        "https://images.unsplash.com/photo-1546456073-92b9f0a8d413?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=387&q=80",
+        "https://images.unsplash.com/photo-1601412436009-d964bd02edbc?ixlib=rb-1.2.1&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=464&q=80",
+    ]
+    time.sleep(1)  # simulated latency of the fake model
+    return [random.choice(candidates) for _ in range(int(count))]
+
+
+cheetah = os.path.join(os.path.dirname(__file__), "files/cheetah1.jpg")
+
+demo = gr.Interface(
+    fn=fake_gan,
+    inputs=[  # only the first value is used; the rest land in fake_gan's *args
+        gr.Number(label="Generation Count"),
+        gr.Image(label="Initial Image (optional)"),
+        gr.Slider(0, 50, 25, label="TV_scale (for smoothness)"),
+        gr.Slider(0, 50, 25, label="Range_Scale (out of range RGB)"),
+        gr.Number(label="Seed"),
+        gr.Number(label="Respacing"),
+    ],
+    outputs=gr.Gallery(label="Generated Images"),
+    title="FD-GAN",
+    description="This is a fake demo of a GAN. In reality, the images are randomly chosen from Unsplash.",
+    examples=[  # one row per example, one cell per input component
+        [2, cheetah, 12, None, None, None],
+        [1, cheetah, 2, None, None, None],
+        [4, cheetah, 42, None, None, None],
+        [5, cheetah, 23, None, None, None],
+        [4, cheetah, 11, None, None, None],
+        [3, cheetah, 1, None, None, None],
+    ],
+)
+
+if __name__ == "__main__":
+    demo.launch()
\ No newline at end of file
diff --git a/images/cooking.jpg b/images/cooking.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5e6a0026995854873875ca7831b0b88e473db837
Binary files /dev/null and b/images/cooking.jpg differ
diff --git a/images/forrest-gump.jpg b/images/forrest-gump.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1c5b19b774bc3b9f9292aa870f7897c69aae89f4
Binary files /dev/null and b/images/forrest-gump.jpg differ
diff --git a/images/friends.jpg b/images/friends.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..92cb34b3671d620c7034331d3ff67300c686b74a
Binary files /dev/null and b/images/friends.jpg differ
diff --git a/images/mbappe.jpg b/images/mbappe.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b5f8c42f389696ad2a176c0282c5f705f01e0ca9
Binary files /dev/null and b/images/mbappe.jpg differ
diff --git a/images/messi.jpg b/images/messi.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bf3e1b4141b376c85df73f4c14c4cac9be858ff3
Binary files /dev/null and b/images/messi.jpg differ
diff --git a/images/neymar-jr-angers-x-psg-160121.jpg b/images/neymar-jr-angers-x-psg-160121.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2e048c6998a15fa1c2af034ac259522b23d34348
Binary files /dev/null and b/images/neymar-jr-angers-x-psg-160121.jpg differ
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..dac17be281ce349af3d16e22fda95372e2930116
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+mmcv-full==1.5.2
+mmdet==2.25.0
+numpy==1.22.4
+opencv-python-headless==4.5.5.64
+openmim==0.1.5
+torch==1.11.0
+torchvision==0.12.0
diff --git a/style.css b/style.css
new file mode 100644
index 0000000000000000000000000000000000000000..22ad0be91ed35841bc456be4a0044474affc9a17
--- /dev/null
+++ b/style.css
@@ -0,0 +1,16 @@
+h1 {
+  text-align: center;
+}
+#input-image {
+  max-height: 300px;
+}
+#label-image {
+  height: 300px;
+}
+#result-image {
+  height: 300px;
+}
+img#visitor-badge {
+  display: block;
+  margin: auto;
+}
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f966daf6533811ab2a96bcdc84c2cf9e7360ad8
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,288 @@
+from typing import Tuple
+import os.path as osp
+import PIL
+import mmcv
+import numpy as np
+from detectron2.utils.colormap import colormap
+from detectron2.utils.visualizer import VisImage, Visualizer
+from mmdet.datasets.coco_panoptic import INSTANCE_OFFSET
+
+CLASSES = [  # position == class id; show_result looks names up by index
+    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
+    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag',
+    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
+    'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
+    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
+    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot',
+    'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant',
+    'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
+    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
+    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
+    'hair drier', 'toothbrush', 'banner', 'blanket', 'bridge', 'cardboard',
+    'counter', 'curtain', 'door-stuff', 'floor-wood', 'flower', 'fruit',
+    'gravel', 'house', 'light', 'mirror-stuff', 'net', 'pillow', 'platform',
+    'playingfield', 'railroad', 'river', 'road', 'roof', 'sand', 'sea',
+    'shelf', 'snow', 'stairs', 'tent', 'towel', 'wall-brick', 'wall-stone',
+    'wall-tile', 'wall-wood', 'water-other', 'window-blind', 'window-other',
+    'tree-merged', 'fence-merged', 'ceiling-merged', 'sky-other-merged',
+    'cabinet-merged', 'table-merged', 'floor-other-merged', 'pavement-merged',
+    'mountain-merged', 'grass-merged', 'dirt-merged', 'paper-merged',
+    'food-other-merged', 'building-other-merged', 'rock-merged',
+    'wall-other-merged', 'rug-merged', 'background'
+]  # 134 names; 'background' sits at 133, the id show_result drops as VOID
+
+PREDICATES = [  # show_result indexes this with argmax of rel_dists[:, 1:]
+    'over',
+    'in front of',
+    'beside',
+    'on',
+    'in',
+    'attached to',
+    'hanging from',
+    'on back of',
+    'falling off',
+    'going down',
+    'painted on',
+    'walking on',
+    'running on',
+    'crossing',
+    'standing on',
+    'lying on',
+    'sitting on',
+    'flying over',
+    'jumping over',
+    'jumping from',
+    'wearing',
+    'holding',
+    'carrying',
+    'looking at',
+    'guiding',
+    'kissing',
+    'eating',
+    'drinking',
+    'feeding',
+    'biting',
+    'catching',
+    'picking',
+    'playing with',
+    'chasing',
+    'climbing',
+    'cleaning',
+    'playing',
+    'touching',
+    'pushing',
+    'pulling',
+    'opening',
+    'cooking',
+    'talking to',
+    'throwing',
+    'slicing',
+    'driving',
+    'riding',
+    'parked on',
+    'driving on',
+    'about to hit',
+    'kicking',
+    'swinging',
+    'entering',
+    'exiting',
+    'enclosing',
+    'leaning on',
+]  # 56 predicate names; order must match the model's relation classes
+
+
+def get_colormap(num_colors: int):  # -> num_colors rows of 3 channel values
+    return np.resize(colormap(), (num_colors, 3)).tolist()
+
+
+def draw_text(
+    viz_img: VisImage = None,
+    text: str = None,
+    x: float = None,
+    y: float = None,
+    color: Tuple[float, float, float] = (0, 0, 0),
+    size: float = 10,
+    padding: float = 5,
+    box_color: str = 'black',
+    font: str = None,
+) -> float:
+    """Draw ``text`` in a filled box on ``viz_img`` and return the box width.
+
+    The text is anchored by its top-left corner at ``(x, y)`` on
+    ``viz_img.ax``. ``color`` is the text colour, ``size`` the font size,
+    ``padding`` the box padding and ``box_color`` the box fill (drawn at
+    alpha 0.8 with no edge). ``font`` is accepted but currently unused.
+    """
+    box_style = {
+        'facecolor': box_color,
+        'alpha': 0.8,
+        'pad': padding,
+        'edgecolor': 'none',
+    }
+    text_obj = viz_img.ax.text(
+        x, y, text, size=size, bbox=box_style, color=color, zorder=10,
+        verticalalignment='top', horizontalalignment='left', rotation=0,
+    )
+
+    # Presumably forces a render so the box patch reports its final extents
+    # before it is measured -- TODO confirm.
+    viz_img.get_image()
+    return text_obj.get_bbox_patch().get_extents().width
+
+
+def show_result(img,
+                result,
+                is_one_stage,
+                num_rel=20,
+                show=False,
+                out_dir=None,
+                out_file=None):
+    """Render panoptic masks and one labelled image per top-scoring relation.
+
+    Args:
+        img: Image path or array, read with ``mmcv.imread``.
+        result: Prediction object providing ``pan_results``, ``labels``,
+            ``masks``, ``rel_dists`` and ``rel_pair_idxes``.
+        is_one_stage: If true, overlay ``result.masks`` with the relation
+            object labels; otherwise overlay masks derived from
+            ``result.pan_results``.
+        num_rel: Maximum number of relations to visualise.
+        show: If false, skip relation rendering and return an empty list.
+        out_dir: Unused; kept for interface compatibility.
+        out_file: If given, the mask overlay is written to this path and
+            the desaturated input to the same directory with a ``bw``
+            filename prefix.
+
+    Returns:
+        A list of arrays, one per selected relation: the image with the
+        subject and object masks overlaid, stacked above a caption strip
+        showing subject, predicate and object from left to right.
+    """
+    # Imported explicitly: a bare ``import PIL`` does not load submodules.
+    import PIL.Image
+    import PIL.ImageEnhance
+
+    # Load image
+    img = mmcv.imread(img)
+    img = img.copy()  # (H, W, 3)
+    img_h, img_w = img.shape[:-1]
+
+    # Desaturate the image (Color factor 0.01 leaves it almost greyscale)
+    img = PIL.Image.fromarray(img)
+    img = PIL.ImageEnhance.Color(img).enhance(0.01)
+    if out_file is not None:
+        # Prefix the file name only, so paths with a directory stay valid.
+        bw_dir, bw_name = osp.split(out_file)
+        mmcv.imwrite(np.asarray(img), osp.join(bw_dir, 'bw' + bw_name))
+
+    # Draw masks
+    pan_results = result.pan_results
+    ids = np.unique(pan_results)[::-1]
+    num_classes = 133
+    ids = ids[ids != num_classes]  # drop the VOID label
+
+    # Get predicted labels
+    labels = [CLASSES[i] for i in (ids % INSTANCE_OFFSET).astype(np.int64)]
+
+    # For psgtr: relation endpoints index into these (labels offset by one)
+    rel_obj_labels = [CLASSES[label - 1] for label in result.labels]
+
+    # (N_m, H, W) masks from the panoptic map, resized to the image size
+    segms = pan_results[None] == ids[:, None, None]
+    segms = [
+        mmcv.image.imresize(m.astype(float), (img_w, img_h)) for m in segms
+    ]
+    # One stage segmentation
+    masks = result.masks
+
+    # Choose colors for each instance in coco
+    num_instances = len(masks) if is_one_stage else len(segms)
+    colormap_coco = (np.array(get_colormap(num_instances)) / 255).tolist()
+
+    # Visualize masks
+    viz = Visualizer(img)
+    viz.overlay_instances(
+        labels=rel_obj_labels if is_one_stage else labels,
+        masks=masks if is_one_stage else segms,
+        assigned_colors=colormap_coco,
+    )
+    viz_img = viz.get_output().get_image()
+    if out_file is not None:
+        mmcv.imwrite(viz_img, out_file)
+
+    # Everything below only builds the returned images, so skip it early.
+    if not show:
+        return []
+
+    # Filter out relations, excluding the background class (column 0)
+    rel_dists = result.rel_dists[:, 1:]
+    rel_scores = rel_dists.max(1)
+    # Clamp so a small prediction set cannot push argpartition out of bounds
+    n_rel_topk = min(num_rel, len(rel_scores))
+    if n_rel_topk <= 0:
+        return []
+    # Extract relations with top scores
+    rel_topk_idx = np.argpartition(rel_scores, -n_rel_topk)[-n_rel_topk:]
+    rel_labels_topk = rel_dists[rel_topk_idx].argmax(1)
+    rel_pair_idxes_topk = result.rel_pair_idxes[rel_topk_idx]
+    relations = np.concatenate(
+        [rel_pair_idxes_topk, rel_labels_topk[..., None]], axis=1)
+
+    # Caption strip settings (constant for every relation)
+    width = img_w
+    curr_y = 2
+    text_size = 25
+    text_padding = 20
+
+    all_rel_vis = []
+    for s_idx, o_idx, rel_id in relations:
+        s_label = rel_obj_labels[s_idx]
+        o_label = rel_obj_labels[o_idx]
+        rel_label = PREDICATES[rel_id]
+        viz = Visualizer(img)
+        viz.overlay_instances(
+            labels=[s_label, o_label],
+            masks=[masks[s_idx], masks[o_idx]],
+            assigned_colors=[colormap_coco[s_idx], colormap_coco[o_idx]],
+        )
+        viz_masked_img = viz.get_output().get_image()
+        # Fresh 40-pixel-high white canvas for this relation's caption
+        viz_graph = VisImage(np.full((40, width, 3), 255))
+        curr_x = 2
+        # Draw subject text
+        text_width = draw_text(
+            viz_img=viz_graph,
+            text=s_label,
+            x=curr_x,
+            y=curr_y,
+            color=colormap_coco[s_idx],
+            size=text_size,
+            padding=text_padding,
+        )
+        curr_x += text_width
+        # Draw relation text
+        text_width = draw_text(
+            viz_img=viz_graph,
+            text=rel_label,
+            x=curr_x,
+            y=curr_y,
+            size=text_size,
+            padding=text_padding,
+            box_color='gainsboro',
+        )
+        curr_x += text_width
+        # Draw object text
+        draw_text(
+            viz_img=viz_graph,
+            text=o_label,
+            x=curr_x,
+            y=curr_y,
+            color=colormap_coco[o_idx],
+            size=text_size,
+            padding=text_padding,
+        )
+        all_rel_vis.append(np.vstack([viz_masked_img, viz_graph.get_image()]))
+
+    return all_rel_vis
\ No newline at end of file