There are two types of annotations for VG region captioning.

One is the original ones in densecap

- Use the split 77398 train, 5000 test: https://github.com/jcjohnson/densecap/blob/master/info/densecap_splits.json
- Preprocess data: https://github.com/jcjohnson/densecap/blob/master/doc/FLAGS.md#preprocesspy
 1. We remove region annotations if their captions are too long, 
 2. or if their bounding boxes have zero area. 
 3. replace a few special characters with more common variants
 4. convert rare words into a special token. 

The other is from GRiT https://github.com/JialianW/GRiT/blob/39b33dbc0900e4be0458af14597fcb1a82d933bb/datasets/DATASETS.md
- Split: 77,396 images for the train set and 5,000 images for the test set.
 - ∼4 million annotated region descriptions with over 50,000 unique words in the train set.
- preprocessing: 
 1. discard object descriptions with more than 15 words 
 2. and convert symbols into English words, e.g., ◦ → “degree”, which means it follows Densecap https://github.com/jcjohnson/densecap/blob/7c32170f134805debe638806ecb0a22bbcd58c5f/preprocess.py#L443


In [1]:
import json
import os

Inspect densecap

In [2]:
densecap_split_file = 'tmp/data/vg_data_json/densecap_splits.json'
densecap_annot_file = 'tmp/data/vg_data_json/region_descriptions.json'
densecap_image_meta_file = 'tmp/data/vg_data_json/image_data.json'

In [3]:
with open(densecap_split_file, 'r') as f:
 densecap_split = json.load(f)
with open(densecap_annot_file, 'r') as f:
 densecap_annot = json.load(f)
with open(densecap_image_meta_file, 'r') as f:
 densecap_image_meta = json.load(f)

In [25]:
densecap_image_meta

[{'width': 800,
 'url': 'https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg',
 'height': 600,
 'image_id': 1,
 'coco_id': None,
 'flickr_id': None},
 {'width': 800,
 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/2.jpg',
 'height': 600,
 'image_id': 2,
 'coco_id': None,
 'flickr_id': None},
 {'width': 640,
 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/3.jpg',
 'height': 480,
 'image_id': 3,
 'coco_id': None,
 'flickr_id': None},
 {'width': 640,
 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/4.jpg',
 'height': 480,
 'image_id': 4,
 'coco_id': None,
 'flickr_id': None},
 {'width': 800,
 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/5.jpg',
 'height': 600,
 'image_id': 5,
 'coco_id': None,
 'flickr_id': None},
 {'width': 800,
 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/6.jpg',
 'height': 600,
 'image_id': 6,
 'coco_id': None,
 'flickr_id': None},
 {'width': 800,
 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/7.jpg',
 'height': 600,
 'i

In [4]:
densecap_split.keys()
for k, v in densecap_split.items():
 print(k, len(v))

test 5000
train 77398
val 5000


In [5]:
img_id = 0
region_id = 3
densecap_annot[img_id]["id"], densecap_annot[img_id]["regions"][region_id]

(1,
 {'region_id': 1385,
 'width': 36,
 'height': 36,
 'image_id': 1,
 'phrase': 'cars headlights are off',
 'y': 377,
 'x': 617})

In [6]:
len(densecap_image_meta), len(densecap_annot)

(108077, 108077)

In [7]:
densecap_train_img_id = set(densecap_split["train"])
densecap_test_img_id = set(densecap_split["test"])

Inspect grit

In [8]:
grit_train_annot_file = 'tmp/data/vg_data_json/train.json'
grit_test_annot_file = 'tmp/data/vg_data_json/test.json'

In [9]:
with open(grit_train_annot_file, 'r') as f:
 grit_train_annot = json.load(f)
with open(grit_test_annot_file, 'r') as f:
 grit_test_annot = json.load(f)

In [10]:
grit_train_annot.keys(), grit_test_annot.keys()

(dict_keys(['images', 'annotations', 'categories']),
 dict_keys(['images', 'annotations', 'categories']))

In [11]:
len(grit_train_annot["images"]), len(grit_test_annot["images"]), len(grit_train_annot["annotations"]), len(grit_test_annot["annotations"])

(77396, 5000, 3596689, 232935)

In [12]:
grit_train_annot["annotations"][0], grit_train_annot["images"][3]

({'id': 1,
 'iscrowd': 0,
 'area': 2499,
 'image_id': 51,
 'category_id': 1,
 'bbox': [312, 279, 49, 51],
 'caption': 'head of a person'},
 {'file_name': '55.jpg', 'height': 600, 'width': 800, 'id': 55})

In [13]:
[i["id"] for i in grit_train_annot["images"][:10]], [i["image_id"] for i in grit_train_annot["annotations"][:10]]

([51, 53, 54, 55, 60, 61, 62, 64, 65, 69],
 [51, 51, 51, 51, 51, 51, 51, 51, 51, 51])

In [14]:
grit_train_img_id_from_img = set([i["id"] for i in grit_train_annot["images"]])
grit_train_id_from_annot = set([i["image_id"] for i in grit_train_annot["annotations"]])
assert grit_train_img_id_from_img == grit_train_id_from_annot

grit_test_img_id_from_img = set([i["id"] for i in grit_test_annot["images"]])
grti_test_id_from_annot = set([i["image_id"] for i in grit_test_annot["annotations"]])
assert grit_test_img_id_from_img == grti_test_id_from_annot

In [15]:
len(grit_train_img_id_from_img), len(grit_test_img_id_from_img), len(grit_train_id_from_annot), len(grti_test_id_from_annot)

(77396, 5000, 77396, 5000)

In [30]:
grit_train_annot["categories"], grit_test_annot["categories"]

([{'id': 1, 'name': 'object'}], [{'id': 1, 'name': 'object'}])

In [32]:
grit_train_annot.keys(), grit_train_annot["images"]

(dict_keys(['images', 'annotations', 'categories']),
 [{'file_name': '51.jpg', 'height': 533, 'width': 800, 'id': 51},
 {'file_name': '53.jpg', 'height': 534, 'width': 800, 'id': 53},
 {'file_name': '54.jpg', 'height': 600, 'width': 800, 'id': 54},
 {'file_name': '55.jpg', 'height': 600, 'width': 800, 'id': 55},
 {'file_name': '60.jpg', 'height': 600, 'width': 800, 'id': 60},
 {'file_name': '61.jpg', 'height': 600, 'width': 800, 'id': 61},
 {'file_name': '62.jpg', 'height': 600, 'width': 800, 'id': 62},
 {'file_name': '64.jpg', 'height': 600, 'width': 800, 'id': 64},
 {'file_name': '65.jpg', 'height': 600, 'width': 800, 'id': 65},
 {'file_name': '69.jpg', 'height': 480, 'width': 640, 'id': 69},
 {'file_name': '70.jpg', 'height': 446, 'width': 600, 'id': 70},
 {'file_name': '71.jpg', 'height': 375, 'width': 500, 'id': 71},
 {'file_name': '72.jpg', 'height': 247, 'width': 495, 'id': 72},
 {'file_name': '73.jpg', 'height': 533, 'width': 800, 'id': 73},
 {'file_name': '74.jpg', 'height': 5

compare captions

In [26]:
def build_densecap_img_id_to_region(densecap_annot):
 img_id_to_region = {}
 for img in densecap_annot:
 img_id = img["id"]
 regions = img["regions"]
 img_id_to_region[img_id] = regions
 return img_id_to_region

def build_grit_img_id_to_regions(grit_annot):
 img_id_to_regions = {}
 for annot in grit_annot["annotations"]:
 img_id = annot["image_id"]
 if img_id not in img_id_to_regions:
 img_id_to_regions[img_id] = []
 img_id_to_regions[img_id].append(annot)
 return img_id_to_regions

def build_densecap_img_id_to_img(densecap_image_meta):
 img_id_to_img = {}
 for img in densecap_image_meta:
 img_id = img["image_id"]
 img_id_to_img[img_id] = img
 return img_id_to_img
densecap_img_id_to_region = build_densecap_img_id_to_region(densecap_annot)
grit_train_img_id_to_regions = build_grit_img_id_to_regions(grit_train_annot)
grit_test_img_id_to_regions = build_grit_img_id_to_regions(grit_test_annot)
densecap_img_id_to_img = build_densecap_img_id_to_img(densecap_image_meta)

In [27]:
densecap_img_id_to_img

{1: {'width': 800,
 'url': 'https://cs.stanford.edu/people/rak248/VG_100K_2/1.jpg',
 'height': 600,
 'image_id': 1,
 'coco_id': None,
 'flickr_id': None},
 2: {'width': 800,
 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/2.jpg',
 'height': 600,
 'image_id': 2,
 'coco_id': None,
 'flickr_id': None},
 3: {'width': 640,
 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/3.jpg',
 'height': 480,
 'image_id': 3,
 'coco_id': None,
 'flickr_id': None},
 4: {'width': 640,
 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/4.jpg',
 'height': 480,
 'image_id': 4,
 'coco_id': None,
 'flickr_id': None},
 5: {'width': 800,
 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/5.jpg',
 'height': 600,
 'image_id': 5,
 'coco_id': None,
 'flickr_id': None},
 6: {'width': 800,
 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/6.jpg',
 'height': 600,
 'image_id': 6,
 'coco_id': None,
 'flickr_id': None},
 7: {'width': 800,
 'url': 'https://cs.stanford.edu/people/rak248/VG_100K/7.jpg'

In [17]:
densecap_train_img_id - grit_train_img_id_from_img, densecap_test_img_id - grit_test_img_id_from_img, len(densecap_train_img_id ), len(grit_train_img_id_to_regions)

({1650, 1684}, set(), 77398, 77396)

In [18]:
densecap_test_img_id - grit_test_img_id_from_img, grit_test_img_id_from_img - densecap_test_img_id

(set(), set())

In [19]:
densecap_split["train"][:10], densecap_split["test"][:10]

([2375025,
 2358524,
 2375338,
 2335199,
 2396428,
 2353294,
 2366245,
 2411288,
 2346175,
 2396322],
 [2342728,
 2414939,
 2397722,
 2386848,
 2398737,
 2367906,
 2384003,
 2399896,
 2359702,
 2414293])

In [20]:
img_id = 2396322; region_id = 0
densecap_img_id_to_region[img_id][region_id], grit_train_img_id_to_regions[img_id][region_id]

({'region_id': 932368,
 'width': 239,
 'height': 63,
 'image_id': 2396322,
 'phrase': 'large RON LEE letters on wall',
 'y': 215,
 'x': 150},
 {'id': 707489,
 'iscrowd': 0,
 'area': 15057,
 'image_id': 2396322,
 'category_id': 1,
 'bbox': [150, 215, 239, 63],
 'caption': 'large ron lee letters on wall'})

In [21]:
img_id = 2414939; region_id = 30
densecap_img_id_to_region[img_id][region_id], grit_test_img_id_to_regions[img_id][region_id]

({'region_id': 45202,
 'width': 150,
 'height': 70,
 'image_id': 2414939,
 'phrase': 'a sign with Spanish words.',
 'y': 91,
 'x': 341},
 {'id': 4806,
 'iscrowd': 0,
 'area': 9453,
 'image_id': 2414939,
 'category_id': 1,
 'bbox': [117, 61, 137, 69],
 'caption': 'two men stand near the grass'})

In [22]:
for i in grit_train_annot["images"]:
 assert i["id"] == int(i["file_name"].split(".")[0])
for i in grit_test_annot["images"]:
 assert i["id"] == int(i["file_name"].split(".")[0])

In [23]:
for i in densecap_image_meta:
 assert i["image_id"] == int(i["url"].split("/")[-1].split(".")[0])