""" Reads in a tsv file with pre-trained bottom up attention features and stores it in HDF5 format. Also store {image_id: feature_idx} as a pickle file. Hierarchy of HDF5 file: { 'image_features': num_images x num_boxes x 2048 array of features 'image_bb': num_images x num_boxes x 4 array of bounding boxes } """ from __future__ import print_function import os import sys import argparse sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import base64 import csv import h5py # import cPickle import _pickle as cPickle import numpy as np import utils import tqdm csv.field_size_limit(sys.maxsize) FIELDNAMES = ['image_id', 'image_w', 'image_h', 'num_boxes', 'boxes', 'features'] def detection_features_converter(dataroot, ver, detector, feature_length, num_fixed_boxes): infile = os.path.join(dataroot, ver, "trainval_%s_%i.tsv"%(detector, num_fixed_boxes)) train_data_file = os.path.join(dataroot, ver, 'train_%s_%i.hdf5'%(detector, num_fixed_boxes)) val_data_file = os.path.join(dataroot, ver, 'val_%s_%i.hdf5'%(detector, num_fixed_boxes)) train_indices_file = os.path.join(dataroot, ver, 'train_%s_%i_imgid2idx.pkl'%(detector, num_fixed_boxes)) val_indices_file = os.path.join(dataroot, ver, 'val_%s_%i_imgid2idx.pkl'%(detector, num_fixed_boxes)) train_ids_file = os.path.join(dataroot, 'train_ids.pkl') val_ids_file = os.path.join(dataroot, 'val_ids.pkl') h_train = h5py.File(train_data_file, "w") h_val = h5py.File(val_data_file, "w") if os.path.exists(train_ids_file) and os.path.exists(val_ids_file): train_imgids = cPickle.load(open(train_ids_file, 'rb')) val_imgids = cPickle.load(open(val_ids_file, 'rb')) else: train_imgids = utils.load_imageid(os.path.join(dataroot, 'clean', 'train2014')) val_imgids = utils.load_imageid(os.path.join(dataroot, 'clean', 'val2014')) cPickle.dump(train_imgids, open(train_ids_file, 'wb')) cPickle.dump(val_imgids, open(val_ids_file, 'wb')) train_indices = {} val_indices = {} train_img_features = h_train.create_dataset( 'image_features', (len(train_imgids), num_fixed_boxes, feature_length), 'f') train_img_bb = h_train.create_dataset( 'image_bb', (len(train_imgids), num_fixed_boxes, 4), 'f') train_spatial_img_features = h_train.create_dataset( 'spatial_features', (len(train_imgids), num_fixed_boxes, 6), 'f') val_img_bb = h_val.create_dataset( 'image_bb', (len(val_imgids), num_fixed_boxes, 4), 'f') val_img_features = h_val.create_dataset( 'image_features', (len(val_imgids), num_fixed_boxes, feature_length), 'f') val_spatial_img_features = h_val.create_dataset( 'spatial_features', (len(val_imgids), num_fixed_boxes, 6), 'f') train_counter = 0 val_counter = 0 print("reading tsv...") # with open(infile, "r+b") as tsv_in_file: with open(infile, "r") as tsv_in_file: reader = csv.DictReader(tsv_in_file, delimiter='\t', fieldnames=FIELDNAMES) for item in tqdm.tqdm(reader): item['num_boxes'] = int(item['num_boxes']) image_id = int(item['image_id']) image_w = float(item['image_w']) image_h = float(item['image_h']) # bboxes = np.frombuffer( # base64.decodestring(item['boxes']), # dtype=np.float32).reshape((item['num_boxes'], -1)) bboxes = np.frombuffer( base64.b64decode(item['boxes']), dtype=np.float32).reshape((item['num_boxes'], -1)) box_width = bboxes[:, 2] - bboxes[:, 0] box_height = bboxes[:, 3] - bboxes[:, 1] scaled_width = box_width / image_w scaled_height = box_height / image_h scaled_x = bboxes[:, 0] / image_w scaled_y = bboxes[:, 1] / image_h box_width = box_width[..., np.newaxis] box_height = box_height[..., np.newaxis] scaled_width = scaled_width[..., 
            spatial_features = np.concatenate(
                (scaled_x,
                 scaled_y,
                 scaled_x + scaled_width,
                 scaled_y + scaled_height,
                 scaled_width,
                 scaled_height),
                axis=1)

            if image_id in train_imgids:
                train_imgids.remove(image_id)
                train_indices[image_id] = train_counter
                train_img_bb[train_counter, :, :] = bboxes
                train_img_features[train_counter, :, :] = np.frombuffer(
                    base64.b64decode(item['features']),
                    dtype=np.float32).reshape((item['num_boxes'], -1))
                train_spatial_img_features[train_counter, :, :] = spatial_features
                train_counter += 1
            elif image_id in val_imgids:
                val_imgids.remove(image_id)
                val_indices[image_id] = val_counter
                val_img_bb[val_counter, :, :] = bboxes
                val_img_features[val_counter, :, :] = np.frombuffer(
                    base64.b64decode(item['features']),
                    dtype=np.float32).reshape((item['num_boxes'], -1))
                val_spatial_img_features[val_counter, :, :] = spatial_features
                val_counter += 1
            else:
                assert False, 'Unknown image id: %d' % image_id

    if len(train_imgids) != 0:
        print('Warning: train_image_ids is not empty')
    if len(val_imgids) != 0:
        print('Warning: val_image_ids is not empty')

    cPickle.dump(train_indices, open(train_indices_file, 'wb'))
    cPickle.dump(val_indices, open(val_indices_file, 'wb'))
    h_train.close()
    h_val.close()
    print("done!")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataroot', type=str, default='../data/')
    parser.add_argument('--ver', type=str, default='clean',
                        help='version of the VQAv2 dataset to process. '
                             '"clean" for the original data. default: clean')
    parser.add_argument('--detector', type=str, default='R-50')
    parser.add_argument('--feat', type=int, default=1024, help='feature size')
    parser.add_argument('--nb', type=int, default=36, help='number of fixed boxes per image')
    args = parser.parse_args()
    detection_features_converter(args.dataroot, args.ver, args.detector, args.feat, args.nb)
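
# A minimal sketch of reading the converted features back (not run by this
# script; the file names assume the default --dataroot/--ver/--detector/--nb
# arguments above, so adjust paths and sizes to match your run):
#
#   import h5py
#   import _pickle as cPickle
#
#   with open('../data/clean/train_R-50_36_imgid2idx.pkl', 'rb') as f:
#       imgid2idx = cPickle.load(f)            # {image_id: row index}
#   with h5py.File('../data/clean/train_R-50_36.hdf5', 'r') as h:
#       idx = imgid2idx[some_image_id]         # any image id present in the pickle
#       feats = h['image_features'][idx]       # (36, feature_length) box features
#       boxes = h['image_bb'][idx]             # (36, 4) raw bounding boxes
#       spatials = h['spatial_features'][idx]  # (36, 6) normalized geometry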