|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
r"""Unbatch RefCOCO, RefCOCO+, RefCOCOg datasets in TFDS structure."""
|
|
|
|
|
|
|
|
import io
|
|
import os
|
|
import pickle
|
|
|
|
import numpy as np
|
|
import PIL.Image
|
|
import pycocotools.coco
|
|
import tensorflow_datasets as tfds
|
|
|
|
# NOTE(review): no use of this constant is visible in the code reviewed here;
# the builder below reads its inputs from `dl_manager.manual_dir` instead.
# Confirm whether it is still needed.
_ROOT_PATH = '/tmp/data/'
|
|
|
|
|
|
class RefCocoConfig(tfds.core.BuilderConfig):
  """Builder config that identifies a single RefCoco variant.

  A variant is a (dataset, partition) pair; the config name is derived from
  both as `{dataset}_{dataset_partition}`.
  """

  def __init__(self, dataset, dataset_partition, **kwargs):
    """Creates the config.

    Args:
      dataset: Dataset name, stored as `self.dataset`.
      dataset_partition: Partition name, stored as `self.dataset_partition`.
      **kwargs: Forwarded unchanged to `tfds.core.BuilderConfig`.
    """
    super().__init__(name=f'{dataset}_{dataset_partition}', **kwargs)
    self.dataset = dataset
    self.dataset_partition = dataset_partition
|
|
|
|
|
|
# Human-readable dataset description; passed as `description` to
# `tfds.core.DatasetInfo` in `RefCocoBv._info`.
_DESCRIPTION = """RefCOCO, RefCOCO+, RefCOCOg datasets.
|
|
|
|
Images, boxes and segmentations are from the original COCO dataset
|
|
(Lin et al, ECCV 2014). The referential segmentations are from two different
|
|
sources:
|
|
|
|
1) RefCOCOg (Mao et al, CVPR 2016):
|
|
- https://github.com/mjhucla/Google_Refexp_toolbox
|
|
- This is the split used in the "refcocog_google" dataset. Note that this
|
|
split has overlapping images in train/validation. The same split is also
|
|
provided in 2).
|
|
|
|
2) Source of RefCOCO and RefCOCO+ (Yu et al, ECCV 2016):
|
|
- https://github.com/lichengunc/refer
|
|
- Apache License 2.0
|
|
- Provides all the splits used for generation of these datasets, including the
|
|
"refcocog_google" split that is identical with the split from 1).
|
|
|
|
For convenience, we provide an additional dataset "refcocox_combined" that
|
|
combines the datasets "refcoco_unc", "refcocoplus_unc", and "refcocog_umd",
|
|
unifying "testA" and "testB" into a single "test" split, and removing any images
|
|
from "train" that appear either in "validation" or "test".
|
|
|
|
Also for convenience, every split is unrolled twice (at the "objects" level and
|
|
at the "object/refs" level) and saved as "{split}_flat".
|
|
"""
|
|
|
|
|
|
# BibTeX entries for the three works cited by this dataset (Mao et al. 2016,
# Yu et al. 2016, Lin et al. 2014); passed as `citation` to
# `tfds.core.DatasetInfo` in `RefCocoBv._info`. Declared as a raw string so the
# backslashes in the BibTeX are kept literally.
_CITATION = r"""
|
|
@inproceedings{DBLP:conf/cvpr/MaoHTCY016,
|
|
author = {Junhua Mao and
|
|
Jonathan Huang and
|
|
Alexander Toshev and
|
|
Oana Camburu and
|
|
Alan L. Yuille and
|
|
Kevin Murphy},
|
|
title = {Generation and Comprehension of Unambiguous Object Descriptions},
|
|
booktitle = {2016 {IEEE} Conference on Computer Vision and Pattern Recognition,
|
|
{CVPR} 2016, Las Vegas, NV, USA, June 27-30, 2016},
|
|
pages = {11--20},
|
|
publisher = {{IEEE} Computer Society},
|
|
year = {2016},
|
|
url = {https://doi.org/10.1109/CVPR.2016.9},
|
|
doi = {10.1109/CVPR.2016.9},
|
|
timestamp = {Fri, 24 Mar 2023 00:02:52 +0100},
|
|
biburl = {https://dblp.org/rec/conf/cvpr/MaoHTCY016.bib},
|
|
bibsource = {dblp computer science bibliography, https://dblp.org}
|
|
}
|
|
|
|
@inproceedings{DBLP:conf/eccv/YuPYBB16,
|
|
author = {Licheng Yu and
|
|
Patrick Poirson and
|
|
Shan Yang and
|
|
Alexander C. Berg and
|
|
Tamara L. Berg},
|
|
editor = {Bastian Leibe and
|
|
Jiri Matas and
|
|
Nicu Sebe and
|
|
Max Welling},
|
|
title = {Modeling Context in Referring Expressions},
|
|
booktitle = {Computer Vision - {ECCV} 2016 - 14th European Conference, Amsterdam,
|
|
The Netherlands, October 11-14, 2016, Proceedings, Part {II}},
|
|
series = {Lecture Notes in Computer Science},
|
|
volume = {9906},
|
|
pages = {69--85},
|
|
publisher = {Springer},
|
|
year = {2016},
|
|
url = {https://doi.org/10.1007/978-3-319-46475-6\_5},
|
|
doi = {10.1007/978-3-319-46475-6\_5},
|
|
timestamp = {Wed, 07 Dec 2022 23:10:23 +0100},
|
|
biburl = {https://dblp.org/rec/conf/eccv/YuPYBB16.bib},
|
|
bibsource = {dblp computer science bibliography, https://dblp.org}
|
|
}
|
|
|
|
@article{DBLP:journals/corr/LinMBHPRDZ14,
|
|
author = {Tsung{-}Yi Lin and
|
|
Michael Maire and
|
|
Serge J. Belongie and
|
|
Lubomir D. Bourdev and
|
|
Ross B. Girshick and
|
|
James Hays and
|
|
Pietro Perona and
|
|
Deva Ramanan and
|
|
Piotr Doll{\'{a}}r and
|
|
C. Lawrence Zitnick},
|
|
title = {Microsoft {COCO:} Common Objects in Context},
|
|
journal = {CoRR},
|
|
volume = {abs/1405.0312},
|
|
year = {2014},
|
|
url = {http://arxiv.org/abs/1405.0312},
|
|
archivePrefix = {arXiv},
|
|
eprint = {1405.0312},
|
|
timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},
|
|
biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},
|
|
bibsource = {dblp computer science bibliography, https://dblp.org}
|
|
}
|
|
"""
|
|
|
|
|
|
|
|
LICENSES = [
|
|
'Attribution-NonCommercial-ShareAlike License',
|
|
'Attribution-NonCommercial License',
|
|
'Attribution-NonCommercial-NoDerivs License',
|
|
'Attribution License',
|
|
'Attribution-ShareAlike License',
|
|
'Attribution-NoDerivs License',
|
|
'No known copyright restrictions',
|
|
'United States Government Work',
|
|
]
|
|
|
|
_licenses_map = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}
|
|
|
|
|
|
|
|
# Object category names, used as the class names of the `category` ClassLabel
# feature. The order defines the integer label of each class, so it must not
# be changed.
CATEGORIES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane',
    'bus', 'train', 'truck', 'boat', 'traffic light',
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
    'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
    'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
    'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
    'cake', 'chair', 'couch', 'potted plant', 'bed',
    'dining table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    'toaster', 'sink', 'refrigerator', 'book', 'clock',
    'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]

# Supercategory names, used as the class names of the `supercategory`
# ClassLabel feature. As with `CATEGORIES`, the order defines the labels.
SUPERCATEGORIES = [
    'accessory',
    'animal',
    'appliance',
    'electronic',
    'food',
    'furniture',
    'indoor',
    'kitchen',
    'outdoor',
    'person',
    'sports',
    'vehicle',
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RefCocoBv(tfds.core.GeneratorBasedBuilder):
  """DatasetBuilder for RefCoco datasets.

  One builder config exists per (dataset, partition) pair in `BUILDER_CONFIGS`.
  Every split is generated twice: under its own name, and under
  "{split}_flat".
  """

  VERSION = tfds.core.Version('1.4.0')
  RELEASE_NOTES = {
      '1.4.0': 'Added flat versions of all dataset splits.',
      '1.3.0': 'Added "refcocox_combined" dataset.',
      '1.2.0': 'Added "train_flat" splits.',
      '1.1.0': 'Added more features (mask etc), nested "refs" in "objects".',
      '1.0.0': 'Initial release.',
  }

  MANUAL_DOWNLOAD_INSTRUCTIONS = """
  1. Install https://pypi.org/project/pycocotools/.

  2. Download data (requires ~20G for COCO images):

  (mkdir -p /tmp/tfds/downloads/manual &&
  cd /tmp/tfds/downloads/manual &&
  wget http://images.cocodataset.org/zips/train2017.zip &&
  wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip &&
  wget https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco.zip &&
  wget https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco+.zip &&
  wget https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcocog.zip &&
  for zip in *.zip; do unzip $zip; done
  )

  3. Run the generation script with `TFDS_DATA_DIR=/tmp/tfds`
  """

  BUILDER_CONFIGS = [
      RefCocoConfig(dataset='refcoco', dataset_partition='unc'),
      RefCocoConfig(dataset='refcoco', dataset_partition='google'),
      RefCocoConfig(dataset='refcocoplus', dataset_partition='unc'),
      RefCocoConfig(dataset='refcocog', dataset_partition='google'),
      RefCocoConfig(dataset='refcocog', dataset_partition='umd'),
      RefCocoConfig(dataset='refcocox', dataset_partition='combined'),
  ]

  def _info(self) -> tfds.core.DatasetInfo:
    """Returns the dataset metadata, including the per-image feature schema.

    Each example is one image with a sequence of `objects`; each object in
    turn carries a sequence of `refs` (referring sentences).
    """
    return tfds.core.DatasetInfo(
        builder=self,
        features=tfds.features.FeaturesDict({
            'id': tfds.features.Scalar(np.int32),
            'image': tfds.features.Image(encoding_format='jpeg'),
            'height': tfds.features.Scalar(np.int32),
            'width': tfds.features.Scalar(np.int32),
            'license': tfds.features.ClassLabel(names=LICENSES),
            'file_name': tfds.features.Text(),
            'flickr_url': tfds.features.Text(),
            'coco_url': tfds.features.Text(),
            'objects': tfds.features.Sequence({
                'id': tfds.features.Scalar(np.int64),
                'area': tfds.features.Scalar(np.float32),
                'bbox': tfds.features.BBoxFeature(),
                'mask': tfds.features.Image(encoding_format='png'),
                'category': tfds.features.ClassLabel(names=CATEGORIES),
                'supercategory': tfds.features.ClassLabel(
                    names=SUPERCATEGORIES
                ),
                'iscrowd': tfds.features.Scalar(np.bool_),
                'refs': tfds.features.Sequence({
                    'id': tfds.features.Scalar(np.int32),
                    'sentence': tfds.features.Text(),
                }),
            }),
        }),
        supervised_keys=None,
        citation=_CITATION,
        description=_DESCRIPTION,
    )

  def _split_generators(self, dl_manager: tfds.download.DownloadManager):
    """Returns a dict mapping split name to its example generator.

    Args:
      dl_manager: Download manager; only its `manual_dir` is used, which must
        contain the manually downloaded and unzipped data.

    Returns:
      A dict with two entries per split allowed for the current builder
      config: `split` and `split + '_flat'`.

    Raises:
      FileNotFoundError: If one of the expected files is missing from the
        manual directory.
    """
    allowed_splits = {
        ('refcoco', 'google'): [
            tfds.Split.TRAIN,
            tfds.Split.VALIDATION,
            tfds.Split.TEST,
        ],
        ('refcoco', 'unc'): [
            tfds.Split.TRAIN,
            tfds.Split.VALIDATION,
            'testA',
            'testB',
        ],
        ('refcocoplus', 'unc'): [
            tfds.Split.TRAIN,
            tfds.Split.VALIDATION,
            'testA',
            'testB',
        ],
        ('refcocog', 'google'): [
            tfds.Split.TRAIN,
            tfds.Split.VALIDATION,
        ],
        ('refcocog', 'umd'): [
            tfds.Split.TRAIN,
            tfds.Split.VALIDATION,
            tfds.Split.TEST,
        ],
        ('refcocox', 'combined'): [
            tfds.Split.TRAIN,
            tfds.Split.VALIDATION,
            tfds.Split.TEST,
        ],
    }
    bc = self.builder_config
    splits = allowed_splits[(bc.dataset, bc.dataset_partition)]

    data_dir = dl_manager.manual_dir
    # One representative file per archive: its presence is taken as evidence
    # that the archive at `url` was downloaded and unzipped into `data_dir`.
    required_files = (
        (
            'http://images.cocodataset.org/zips/train2017.zip',
            ('train2017', '000000147328.jpg'),
        ),
        (
            'http://images.cocodataset.org/annotations/'
            'annotations_trainval2017.zip',
            ('annotations', 'instances_train2017.json'),
        ),
        (
            'https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco.zip',
            ('refcoco', 'refs(unc).p'),
        ),
        (
            'https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco+.zip',
            ('refcoco+', 'refs(unc).p'),
        ),
        (
            'https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcocog.zip',
            ('refcocog', 'refs(umd).p'),
        ),
    )
    for url, components in required_files:
      # Keep the path itself (not the result of the existence check) so that
      # the error message names the file that is actually missing.
      path = os.path.join(data_dir, *components)
      if not os.path.exists(path):
        raise FileNotFoundError(
            f'Could not find {path}: please download {url} and unzip into'
            f' {data_dir}'
        )

    coco = pycocotools.coco.COCO(
        os.path.join(data_dir, 'annotations', 'instances_train2017.json')
    )

    return {
        split + suffix: self._generate_examples(
            coco, data_dir, bc.dataset, bc.dataset_partition, split + suffix,
        )
        for split in splits
        for suffix in ('', '_flat')
    }

  def _generate_examples(
      self, coco, data_dir, dataset, dataset_partition, split):
    """Delegates to the module-level `_generate_examples` function."""
    return _generate_examples(coco, data_dir, dataset, dataset_partition, split)
|
|
|
|
|
|
def _get_ids(data_dir, dataset, dataset_partition, split):
  """Returns `img_ids, ann_to_refs` for specified dataset/partition/split.

  Args:
    data_dir: Directory containing one sub-directory per dataset, each holding
      `refs({partition}).p` pickle files.
    dataset: Dataset name as used in the builder config. 'refcocoplus' is read
      from the 'refcoco+' directory; 'refcocox' (with partition 'combined') is
      assembled from refcocog/umd, refcoco/unc and refcoco+/unc.
    dataset_partition: Partition name.
    split: Split name without any '_flat' suffix. `tfds.Split.VALIDATION` is
      translated to 'val', the name compared against each record's 'split'.

  Returns:
    A tuple `(img_ids, ann_to_refs)` where `img_ids` is the set of image ids
    that have at least one reference in the split, and `ann_to_refs` maps an
    annotation id to a list of `{'id': sent_id, 'sentence': sent}` dicts.
  """

  def load(dataset, dataset_partition):
    """Loads the list of reference records for one dataset/partition."""
    fname = f'refs({dataset_partition}).p'
    path = os.path.join(data_dir, dataset, fname)
    # NOTE: unpickling can execute arbitrary code. Only point `data_dir` at
    # files obtained from a trusted source.
    with open(path, 'rb') as f:
      return pickle.load(f)

  if split == tfds.Split.VALIDATION:
    split = 'val'

  if (dataset, dataset_partition) == ('refcocox', 'combined'):
    refcoco = (
        load('refcocog', 'umd')
        + load('refcoco', 'unc')
        + load('refcoco+', 'unc')
    )
    # The combined dataset merges "test", "testA" and "testB" into "test".
    splits = ('test', 'testA', 'testB') if split == 'test' else (split,)

    exclude_img_ids = set()
    if split == 'train':
      # An image can be "train" in one source dataset and "val"/"test" in
      # another; drop every such image from the combined training split.
      exclude_img_ids = {
          r['image_id'] for r in refcoco if r['split'] != 'train'
      }
    refcoco = [
        r
        for r in refcoco
        if r['split'] in splits and r['image_id'] not in exclude_img_ids
    ]
  else:
    if dataset == 'refcocoplus':
      dataset = 'refcoco+'
    refcoco = [
        r for r in load(dataset, dataset_partition) if r['split'] == split
    ]

  img_ids = {r['image_id'] for r in refcoco}
  ann_to_refs = {}
  for r in refcoco:
    for sent in r['sentences']:
      ann_to_refs.setdefault(r['ann_id'], []).append(dict(
          id=sent['sent_id'],
          sentence=sent['sent']
      ))

  return img_ids, ann_to_refs
|
|
|
|
|
|
def _generate_examples(coco, data_dir, dataset, dataset_partition, split):
  """Generates examples for a given split.

  Args:
    coco: Annotation index providing `getImgIds`, `loadImgs`, `getAnnIds`,
      `loadAnns`, `loadCats` and `annToMask` (a `pycocotools.coco.COCO`
      instance when called from `RefCocoBv`).
    data_dir: Directory containing the `train2017` image folder and the
      reference files read by `_get_ids`.
    dataset: Dataset name from the builder config.
    dataset_partition: Partition name from the builder config.
    split: Split name. If it contains '_flat', one example is produced per
      (object, referring sentence) pair instead of one example per image.

  Yields:
    `(key, example)` tuples. In flat mode the key is
    `'{img_id}_{ann_id}_{ref_index}'` and the example holds exactly one object
    with exactly one ref; otherwise the key is the image id and the example
    holds every object of that image that has refs in this split.
  """
  flat = '_flat' in split
  split = split.replace('_flat', '')
  img_ids, ann_to_refs = _get_ids(data_dir, dataset, dataset_partition, split)

  for img_id in coco.getImgIds():
    if img_id not in img_ids:
      continue
    img, = coco.loadImgs([img_id])

    image_fields = {
        'id': img_id,
        'image': os.path.join(data_dir, 'train2017', img['file_name']),
        'height': img['height'],
        'width': img['width'],
        'license': LICENSES[_licenses_map[img['license']]],
        'file_name': img['file_name'],
        'flickr_url': img['flickr_url'],
        'coco_url': img['coco_url'],
    }
    objects = []
    for ann in coco.loadAnns(coco.getAnnIds(img_id)):
      refs = ann_to_refs.get(ann['id'])
      if not refs:
        # Object has no referring sentence in this split.
        continue
      cat, = coco.loadCats([ann['category_id']])

      # Encode the mask to PNG once and keep the raw bytes, so that every
      # emitted object gets its own readable buffer.
      mask = coco.annToMask(ann).astype(np.bool_)
      mask_buf = io.BytesIO()
      PIL.Image.fromarray(mask).save(mask_buf, 'png')
      mask_png = mask_buf.getvalue()

      object_fields = {
          'id': ann['id'],
          'category': cat['name'],
          'supercategory': cat['supercategory'],
          'iscrowd': ann['iscrowd'],
          'area': ann['area'],
          'bbox': _convert_bbox(img, *ann['bbox']),
      }
      if flat:
        for ref_i, ref in enumerate(refs):
          # Build a fresh example for each yield: previously yielded examples
          # must not change if the consumer holds on to them.
          flat_object = dict(
              object_fields, mask=io.BytesIO(mask_png), refs=[ref])
          yield (
              f'{img_id}_{ann["id"]}_{ref_i}',
              dict(image_fields, objects=[flat_object]),
          )
      else:
        objects.append(
            dict(object_fields, mask=io.BytesIO(mask_png), refs=refs))

    if not flat:
      yield img_id, dict(image_fields, objects=objects)
|
|
|
|
|
|
def _convert_bbox(img, x, y, w, h):
  """Converts an `x, y, w, h` box into a `tfds.features.BBox`.

  Args:
    img: Image record providing 'height' and 'width'.
    x: Left edge of the box (presumably in pixels -- confirm with caller).
    y: Top edge of the box.
    w: Box width.
    h: Box height.

  Returns:
    A `tfds.features.BBox` whose coordinates are the box edges divided by the
    corresponding image dimension.
  """
  img_height = img['height']
  img_width = img['width']
  top = y / img_height
  left = x / img_width
  bottom = (y + h) / img_height
  right = (x + w) / img_width
  return tfds.features.BBox(ymin=top, xmin=left, ymax=bottom, xmax=right)
|
|
|