# coding=utf-8
# Copyright 2021 The Deeplab2 Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Provides data from segmentation datasets.

Currently, we support the following datasets:

1. Cityscapes dataset (https://www.cityscapes-dataset.com).

The Cityscapes dataset contains 19 semantic labels (such as road, person, car,
and so on) for urban street scenes.

2. KITTI-STEP (http://www.cvlibs.net/datasets/kitti/).

The KITTI-STEP dataset enriches the KITTI-MOTS data with additional `stuff'
annotations.

3. MOTChallenge-STEP (https://motchallenge.net/).

The MOTChallenge-STEP dataset enriches the MOTSChallenge data with additional
`stuff' annotations.

4. MSCOCO panoptic segmentation (http://cocodataset.org/#panoptic-2018).

Panoptic segmentation annotations for the MSCOCO dataset. Note that we convert
the provided MSCOCO panoptic segmentation format to the following one:
panoptic label = semantic label * 256 + instance id.

5. Cityscapes-DVPS (https://github.com/joe-siyuan-qiao/ViP-DeepLab).

The Cityscapes-DVPS dataset augments Cityscapes-VPS
(https://github.com/mcahny/vps) with depth annotations.

References:

- Marius Cordts, Mohamed Omran, Sebastian Ramos, Timo Rehfeld, Markus
  Enzweiler, Rodrigo Benenson, Uwe Franke, Stefan Roth, and Bernt Schiele,
  "The Cityscapes Dataset for Semantic Urban Scene Understanding." In CVPR,
  2016.

- Andreas Geiger, Philip Lenz, and Raquel Urtasun, "Are we ready for
  Autonomous Driving? The KITTI Vision Benchmark Suite." In CVPR, 2012.

- Alexander Kirillov, Kaiming He, Ross Girshick, Carsten Rother, and Piotr
  Dollar, "Panoptic Segmentation." In CVPR, 2019.

- Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B.
  Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollar, and C.
  Lawrence Zitnick, "Microsoft COCO: Common Objects in Context." In ECCV,
  2014.

- Anton Milan, Laura Leal-Taixe, Ian Reid, Stefan Roth, and Konrad Schindler,
  "MOT16: A Benchmark for Multi-Object Tracking." arXiv:1603.00831, 2016.

- Paul Voigtlaender, Michael Krause, Aljosa Osep, Jonathon Luiten, Berin
  Balachandar Gnana Sekar, Andreas Geiger, and Bastian Leibe, "MOTS:
  Multi-Object Tracking and Segmentation." In CVPR, 2019.

- Mark Weber, Jun Xie, Maxwell Collins, Yukun Zhu, Paul Voigtlaender, Hartwig
  Adam, Bradley Green, Andreas Geiger, Bastian Leibe, Daniel Cremers, Aljosa
  Osep, Laura Leal-Taixe, and Liang-Chieh Chen, "STEP: Segmenting and
  Tracking Every Pixel." arXiv:2102.11859, 2021.

- Dahun Kim, Sanghyun Woo, Joon-Young Lee, and In So Kweon, "Video Panoptic
  Segmentation." In CVPR, 2020.

- Siyuan Qiao, Yukun Zhu, Hartwig Adam, Alan Yuille, and Liang-Chieh Chen,
  "ViP-DeepLab: Learning Visual Perception with Depth-aware Video Panoptic
  Segmentation." In CVPR, 2021.
"""

import collections

# Dataset names.
_CITYSCAPES = 'cityscapes'
_CITYSCAPES_PANOPTIC = 'cityscapes_panoptic'
_KITTI_STEP = 'kitti_step'
_MOTCHALLENGE_STEP = 'motchallenge_step'
_CITYSCAPES_DVPS = 'cityscapes_dvps'
_COCO_PANOPTIC = 'coco_panoptic'
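
# The module docstring notes that MSCOCO panoptic annotations are converted to
# the format `panoptic label = semantic label * 256 + instance id`; the other
# panoptic datasets below use the same scheme with a divisor of 1000 (see the
# `panoptic_label_divisor` field). The helper below is an illustrative sketch,
# not part of the original module, showing how such a label splits back into
# its two components.
def _decode_panoptic_label(panoptic_label, label_divisor):
  """Splits a panoptic label into (semantic_label, instance_id).

  Illustrative sketch only: assumes the encoding
  panoptic_label = semantic_label * label_divisor + instance_id.

  Args:
    panoptic_label: An integer panoptic label.
    label_divisor: The dataset's panoptic label divisor, e.g. 256 for COCO.

  Returns:
    A (semantic_label, instance_id) tuple of integers.
  """
  return panoptic_label // label_divisor, panoptic_label % label_divisor


# For example, the COCO panoptic label 1283 = 5 * 256 + 3 decodes to semantic
# class 5 with instance id 3: _decode_panoptic_label(1283, 256) == (5, 3).
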
# Colormap names.
_CITYSCAPES_COLORMAP = 'cityscapes'
_MOTCHALLENGE_COLORMAP = 'motchallenge'
_COCO_COLORMAP = 'coco'


# Named tuple to describe dataset properties.
DatasetDescriptor = collections.namedtuple(
    'DatasetDescriptor', [
        'dataset_name',  # Dataset name.
        'splits_to_sizes',  # Splits of the dataset into train, val and test.
        'num_classes',  # Number of semantic classes.
        'ignore_label',  # Ignore label value used for semantic segmentation.

        # Fields below are used for panoptic segmentation and will be None
        # for semantic segmentation datasets.

        # Label divisor only used in panoptic segmentation annotation to
        # infer semantic label and instance id.
        'panoptic_label_divisor',
        # A tuple of classes that contains instance annotations. For example,
        # the 'person' class has instance annotations while 'sky' does not.
        'class_has_instances_list',
        # A flag indicating whether the dataset is a video dataset that
        # contains sequence IDs and frame IDs.
        'is_video_dataset',
        # A string specifying the colormap that should be used for
        # visualization, e.g. 'cityscapes'.
        'colormap',
        # A flag indicating whether the dataset contains depth annotations.
        'is_depth_dataset',
    ]
)

CITYSCAPES_INFORMATION = DatasetDescriptor(
    dataset_name=_CITYSCAPES,
    splits_to_sizes={'train_fine': 2975,
                     'train_coarse': 22973,
                     'trainval_fine': 3475,
                     'trainval_coarse': 23473,
                     'val_fine': 500,
                     'test_fine': 1525},
    num_classes=19,
    ignore_label=255,
    panoptic_label_divisor=None,
    class_has_instances_list=None,
    is_video_dataset=False,
    colormap=_CITYSCAPES_COLORMAP,
    is_depth_dataset=False,
)

CITYSCAPES_PANOPTIC_INFORMATION = DatasetDescriptor(
    dataset_name=_CITYSCAPES_PANOPTIC,
    splits_to_sizes={'train_fine': 2975,
                     'val_fine': 500,
                     'trainval_fine': 3475,
                     'test_fine': 1525},
    num_classes=19,
    ignore_label=255,
    panoptic_label_divisor=1000,
    class_has_instances_list=tuple(range(11, 19)),
    is_video_dataset=False,
    colormap=_CITYSCAPES_COLORMAP,
    is_depth_dataset=False,
)

KITTI_STEP_INFORMATION = DatasetDescriptor(
    dataset_name=_KITTI_STEP,
    splits_to_sizes={'train': 5027,
                     'val': 2981,
                     'test': 11095},
    num_classes=19,
    ignore_label=255,
    panoptic_label_divisor=1000,
    class_has_instances_list=(11, 13),
    is_video_dataset=True,
    colormap=_CITYSCAPES_COLORMAP,
    is_depth_dataset=False,
)

MOTCHALLENGE_STEP_INFORMATION = DatasetDescriptor(
    dataset_name=_MOTCHALLENGE_STEP,
    splits_to_sizes={'train': 525,  # Sequence 9.
                     'val': 600,  # Sequence 2.
                     'test': 0},
    num_classes=7,
    ignore_label=255,
    panoptic_label_divisor=1000,
    class_has_instances_list=(4,),
    is_video_dataset=True,
    colormap=_MOTCHALLENGE_COLORMAP,
    is_depth_dataset=False,
)

CITYSCAPES_DVPS_INFORMATION = DatasetDescriptor(
    dataset_name=_CITYSCAPES_DVPS,
    # The numbers of images are 2400/300/300 for train/val/test. Here, the
    # sizes are the number of consecutive frame pairs. As each sequence has 6
    # frames, the number of pairs for the train split is 2400 / 6 * 5 = 2000.
    # Similarly, we get 250 pairs for the val split and the test split.
    splits_to_sizes={'train': 2000,
                     'val': 250,
                     'test': 250},
    num_classes=19,
    ignore_label=255,
    panoptic_label_divisor=1000,
    class_has_instances_list=tuple(range(11, 19)),
    is_video_dataset=True,
    colormap=_CITYSCAPES_COLORMAP,
    is_depth_dataset=True,
)

COCO_PANOPTIC_INFORMATION = DatasetDescriptor(
    dataset_name=_COCO_PANOPTIC,
    splits_to_sizes={'train': 118287,
                     'val': 5000,
                     'test': 40670},
    num_classes=134,
    ignore_label=0,
    panoptic_label_divisor=256,
    class_has_instances_list=tuple(range(1, 81)),
    is_video_dataset=False,
    colormap=_COCO_COLORMAP,
    is_depth_dataset=False,
)

MAP_NAME_TO_DATASET_INFO = {
    _CITYSCAPES: CITYSCAPES_INFORMATION,
    _CITYSCAPES_PANOPTIC: CITYSCAPES_PANOPTIC_INFORMATION,
    _KITTI_STEP: KITTI_STEP_INFORMATION,
    _MOTCHALLENGE_STEP: MOTCHALLENGE_STEP_INFORMATION,
    _CITYSCAPES_DVPS: CITYSCAPES_DVPS_INFORMATION,
    _COCO_PANOPTIC: COCO_PANOPTIC_INFORMATION,
}

MAP_NAMES = list(MAP_NAME_TO_DATASET_INFO.keys())
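

# Example usage, a minimal sketch (`_lookup_dataset_info` is hypothetical and
# not part of this module's API): descriptors are retrieved by dataset name,
# and their fields drive dataset-specific behavior downstream.
def _lookup_dataset_info(dataset_name):
  """Returns the DatasetDescriptor registered under `dataset_name`."""
  if dataset_name not in MAP_NAME_TO_DATASET_INFO:
    raise ValueError('Unsupported dataset %s. Supported datasets: %s.' %
                     (dataset_name, MAP_NAMES))
  return MAP_NAME_TO_DATASET_INFO[dataset_name]


# For instance, `_lookup_dataset_info('kitti_step')` yields a descriptor with
# panoptic_label_divisor=1000 and class_has_instances_list=(11, 13), i.e.
# only classes 11 and 13 carry instance annotations in KITTI-STEP.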