# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.

import numpy as np
import os
import xml.etree.ElementTree as ET
from typing import List, Tuple, Union

from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode
from detectron2.utils.file_io import PathManager

__all__ = ["load_voc_instances", "register_pascal_voc"]


# fmt: off
CLASS_NAMES = (
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"
)
# fmt: on


def load_voc_instances(dirname: str, split: str, class_names: Union[List[str], Tuple[str, ...]]):
    """
    Load Pascal VOC detection annotations to Detectron2 format.

    Args:
        dirname: Contain "Annotations", "ImageSets", "JPEGImages"
        split (str): one of "train", "test", "val", "trainval"
        class_names: list or tuple of class names

    Returns:
        list[dict]: one dict per image id listed in
        ``ImageSets/Main/<split>.txt``, with the keys "file_name", "image_id",
        "height", "width" and "annotations". Each annotation is a dict with
        "category_id" (index of the object's name in ``class_names``),
        "bbox" (``[xmin, ymin, xmax, ymax]`` as floats) and "bbox_mode"
        (``BoxMode.XYXY_ABS``).

    Raises:
        ValueError: if an annotated object's name is not in ``class_names``.
    """
    with PathManager.open(os.path.join(dirname, "ImageSets", "Main", split + ".txt")) as f:
        # ndmin=1 keeps the result 1-D even when the split lists a single image id;
        # without it np.loadtxt returns a 0-d array, which cannot be iterated.
        fileids = np.loadtxt(f, dtype=str, ndmin=1)

    # Many small annotation files are read below, so resolve the annotation
    # directory to a local path once up front.
    annotation_dirname = PathManager.get_local_path(os.path.join(dirname, "Annotations/"))

    # Build the name -> index table once instead of scanning class_names for every
    # object. setdefault keeps the first occurrence, matching class_names.index().
    class_to_id = {}
    for idx, class_name in enumerate(class_names):
        class_to_id.setdefault(class_name, idx)

    dicts = []
    for fileid in fileids:
        anno_file = os.path.join(annotation_dirname, fileid + ".xml")
        jpeg_file = os.path.join(dirname, "JPEGImages", fileid + ".jpg")

        with PathManager.open(anno_file) as f:
            tree = ET.parse(f)

        r = {
            "file_name": jpeg_file,
            "image_id": fileid,
            "height": int(tree.findall("./size/height")[0].text),
            "width": int(tree.findall("./size/width")[0].text),
        }
        instances = []

        for obj in tree.findall("object"):
            cls = obj.find("name").text
            # We include "difficult" samples in training: the <difficult> flag is
            # deliberately not read, so such objects are never skipped.
            # Based on limited experiments, they don't hurt accuracy.
            bbox = obj.find("bndbox")
            bbox = [float(bbox.find(x).text) for x in ["xmin", "ymin", "xmax", "ymax"]]
            # Original annotations are integers in the range [1, W or H]
            # Assuming they mean 1-based pixel indices (inclusive),
            # a box with annotation (xmin=1, xmax=W) covers the whole image.
            # In coordinate space this is represented by (xmin=0, xmax=W)
            bbox[0] -= 1.0
            bbox[1] -= 1.0

            if cls not in class_to_id:
                raise ValueError(
                    "Unknown class name {!r} in {}; expected one of {}".format(
                        cls, anno_file, list(class_names)
                    )
                )
            instances.append(
                {"category_id": class_to_id[cls], "bbox": bbox, "bbox_mode": BoxMode.XYXY_ABS}
            )
        r["annotations"] = instances
        dicts.append(r)
    return dicts


def register_pascal_voc(name, dirname, split, year, class_names=CLASS_NAMES):
    """
    Register a Pascal VOC split under ``name`` and attach its metadata.

    The dataset is registered in ``DatasetCatalog`` with a callable that runs
    ``load_voc_instances(dirname, split, class_names)`` when invoked, so no
    annotation is read at registration time.

    Args:
        name: key to register in ``DatasetCatalog`` and ``MetadataCatalog``.
        dirname: dataset root directory, forwarded to ``load_voc_instances``.
        split: split name, forwarded to ``load_voc_instances``.
        year: stored as-is in the dataset metadata.
        class_names: class names used for category ids; stored in the metadata
            as the ``thing_classes`` list. Defaults to ``CLASS_NAMES``.
    """
    DatasetCatalog.register(name, lambda: load_voc_instances(dirname, split, class_names))
    MetadataCatalog.get(name).set(
        thing_classes=list(class_names), dirname=dirname, year=year, split=split
    )