Spaces:

Pinwheel
/

GLIP-BLIP-Object-Detection-VQA

Runtime error

App Files Files Community

GLIP-BLIP-Object-Detection-VQA / maskrcnn_benchmark /data /build.py

Pinwheel

HF Demo

128757a about 2 years ago

raw

history blame

23.1 kB

	# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
	import bisect
	import copy
	import logging
	import os

	import torch.utils.data
	import torch.distributed as dist
	from maskrcnn_benchmark.utils.comm import get_world_size
	from maskrcnn_benchmark.utils.imports import import_file

	from . import datasets as D
	from . import samplers

	from .collate_batch import BatchCollator, BBoxAugCollator
	from .transforms import build_transforms

	from transformers import AutoTokenizer
	from .datasets.duplicate_dataset import create_duplicate_dataset

	def build_dataset(cfg, dataset_list, transforms, dataset_catalog, is_train=True, class_concat=False, extra_args={}):
	"""
	Arguments:
	dataset_list (list[str]): Contains the names of the datasets, i.e.,
	coco_2014_trian, coco_2014_val, etc
	transforms (callable): transforms to apply to each (image, target) sample
	dataset_catalog (DatasetCatalog): contains the information on how to
	construct a dataset.
	is_train (bool): whether to setup the dataset for training or testing
	"""
	if not isinstance(dataset_list, (list, tuple)):
	raise RuntimeError(
	"dataset_list should be a list of strings, got {}".format(dataset_list)
	)
	datasets = []
	num_category = 1
	for dataset_id, dataset_name in enumerate(dataset_list, 1):
	if is_train:
	dataset_name = dataset_name + cfg.DATASETS.TRAIN_DATASETNAME_SUFFIX
	else:
	dataset_name = dataset_name + cfg.DATASETS.TEST_DATASETNAME_SUFFIX
	data = dataset_catalog.get(dataset_name)
	factory = getattr(D, data["factory"])
	args = data["args"]
	# for COCODataset, we want to remove images without annotations
	# during training
	if data["factory"] == "COCODataset":
	args["remove_images_without_annotations"] = is_train

	if data["factory"] == "PascalVOCDataset":
	args["use_difficult"] = not is_train
	if data["factory"] in ["VGTSVDataset", "CocoDetectionTSV", "ODTSVDataset"]:
	args["extra_fields"] = ["class"]
	if cfg.MODEL.MASK_ON:
	args["extra_fields"].append("mask")

	if data["factory"] in ["CocoGrounding", "CocoDetectionTSV", "CaptionTSV", "MixedDataset", "FlickrDataset", "RefExpDataset", "GQADataset", "PseudoData", "PhrasecutDetection"]:
	# args["return_masks"] = False
	args["return_masks"] = cfg.MODEL.MASK_ON
	args["return_tokens"] = True
	args["max_num_labels"] = cfg.TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM
	args["max_query_len"] = cfg.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN

	args["transforms"] = transforms
	args.update(extra_args)

	if dataset_name == "flickr30k_train":
	copy = cfg.DATASETS.FLICKR_COPY
	elif dataset_name in ["mixed_train", "mixed_train_no_coco"]:
	copy = cfg.DATASETS.MIXED_COPY
	elif dataset_name == "COCO_odinw_train_8copy_dt_train":
	copy = cfg.DATASETS.COCO_COPY
	elif dataset_name == "LVIS_odinw_train_8copy_dt_train":
	copy = cfg.DATASETS.LVIS_COPY
	elif dataset_name == "object365_odinw_2copy_dt_train":
	copy = cfg.DATASETS.OBJECT365_COPY
	elif dataset_name == "vg_odinw_clipped_8copy_dt_train":
	copy = cfg.DATASETS.VG_COPY
	elif dataset_name == "vg_vgoi6_clipped_8copy_dt_train":
	copy = cfg.DATASETS.VG_COPY
	elif dataset_name == "imagenetod_train_odinw_2copy_dt":
	copy = cfg.DATASETS.IN_COPY
	elif dataset_name == "oi_train_odinw_dt":
	copy = cfg.DATASETS.OI_COPY
	elif is_train:
	copy = cfg.DATASETS.GENERAL_COPY
	elif not is_train:
	copy = cfg.DATASETS.GENERAL_COPY_TEST
	else:
	copy = -1 # do not ever copy test

	if copy != -1:
	new_factory = create_duplicate_dataset(factory)
	dataset = new_factory(copy=copy, **args)
	else:
	# make dataset from factory
	dataset = factory(**args)

	print(dataset_name, 'has the {} data points'.format(len(dataset)), data["factory"])

	if class_concat:
	category = list(dataset.contiguous_category_id_to_json_id.values())
	dataset.contiguous_category_id_to_json_id = {}
	dataset.json_category_id_to_contiguous_id = {}
	for id, cat in enumerate(category, start=num_category):
	dataset.json_category_id_to_contiguous_id[cat] = id
	dataset.contiguous_category_id_to_json_id[id] = cat
	num_category += len(category)
	print("Found {} #category after group {}, concating ...".format(num_category, dataset_id))
	datasets.append(dataset)

	# for testing, return a list of datasets
	if not is_train:
	return datasets

	# for training, concatenate all datasets into a single one
	dataset = datasets[0]
	if len(datasets) > 1:
	dataset = D.ConcatDataset(datasets)

	return [dataset]


	def build_dataset_by_group(dataset_list, transforms, dataset_catalog, is_train=True, class_by_group=True,
	                           class_concat=False, extra_args=None):
	    """Instantiate datasets that are organised in groups.

	    Arguments:
	        dataset_list (list[list[str]]): one inner sequence of dataset names
	            per group, i.e., [["coco_2014_train"], ["coco_2014_val"]]
	        transforms (callable): transforms to apply to each (image, target) sample
	        dataset_catalog (DatasetCatalog): contains the information on how to
	            construct a dataset.
	        is_train (bool): whether to setup the dataset for training or testing
	        class_by_group (bool): when true, the category stored under contiguous
	            id 1 of every non-"Background" dataset is re-keyed to the 1-based
	            index of its group (one class per group is assumed).
	        class_concat (bool): when true, renumber contiguous category ids so
	            that successive groups occupy consecutive id ranges; all datasets
	            of one group share the same starting id.
	        extra_args (dict, optional): additional keyword arguments passed to
	            every dataset constructor; they take precedence over the
	            arguments supplied by the catalog.

	    Returns:
	        list: when testing, a flat list of all datasets; when training, one
	        ``ConcatDataset`` per group, or a one-element list holding a
	        ``ConcatDataset`` of all groups if ``class_concat`` is set.

	    Raises:
	        RuntimeError: if ``dataset_list`` is neither a list nor a tuple.
	    """
	    if not isinstance(dataset_list, (list, tuple)):
	        raise RuntimeError(
	            "dataset_list should be a list of strings, got {}".format(dataset_list)
	        )
	    if extra_args is None:
	        extra_args = {}

	    num_category = 1
	    grouped_datasets = []
	    for group_id, group in enumerate(dataset_list, 1):
	        datasets = []
	        for dataset_name in group:
	            data = dataset_catalog.get(dataset_name)
	            factory = getattr(D, data["factory"])
	            # Work on a shallow copy so the catalog's own dict is never modified.
	            args = dict(data["args"])
	            # for COCODataset, we want to remove images without annotations
	            # during training
	            if data["factory"] == "COCODataset":
	                args["remove_images_without_annotations"] = is_train
	            if data["factory"] == "PascalVOCDataset":
	                args["use_difficult"] = not is_train
	            args["transforms"] = transforms
	            args.update(extra_args)
	            # make dataset from factory
	            dataset = factory(**args)

	            # check if dataset is grouped by task, assume one class per task
	            if class_by_group and data["factory"] != "Background":
	                category = dataset.contiguous_category_id_to_json_id[1]
	                del dataset.contiguous_category_id_to_json_id[1]
	                dataset.json_category_id_to_contiguous_id[category] = group_id
	                dataset.contiguous_category_id_to_json_id[group_id] = category

	            datasets.append(dataset)

	        if class_concat:
	            # NOTE(review): the id offset advances once per group (by the size
	            # of the last dataset's category list), so datasets of one group
	            # share a range - confirm against the upstream file, whose
	            # indentation was lost in this copy.
	            json_ids = []
	            for dataset in datasets:
	                json_ids = list(dataset.contiguous_category_id_to_json_id.values())
	                dataset.contiguous_category_id_to_json_id = {}
	                dataset.json_category_id_to_contiguous_id = {}
	                for contiguous_id, json_id in enumerate(json_ids, start=num_category):
	                    dataset.json_category_id_to_contiguous_id[json_id] = contiguous_id
	                    dataset.contiguous_category_id_to_json_id[contiguous_id] = json_id
	            num_category += len(json_ids)
	            print("Found {} #category after group {}, concating ...".format(num_category, group_id))

	        if is_train:
	            # for training, each group is merged into a single dataset
	            datasets = D.ConcatDataset(datasets)

	        grouped_datasets.append(datasets)

	    # for testing, return a flat list of datasets
	    if not is_train:
	        return [dataset for group in grouped_datasets for dataset in group]

	    # with class concatenation, all groups are merged into a single dataset
	    if class_concat:
	        return [D.ConcatDataset(grouped_datasets)]

	    return grouped_datasets


	def make_data_sampler(dataset, shuffle, distributed, num_replicas=None, rank=None, use_random_seed=True):
	    """Return the index sampler to use for ``dataset``.

	    Without ``distributed`` this is a torch ``RandomSampler`` when ``shuffle``
	    is truthy and a ``SequentialSampler`` otherwise. With ``distributed`` the
	    project's ``DistributedSampler`` is built from ``shuffle``,
	    ``num_replicas``, ``rank`` and ``use_random_seed``.
	    """
	    if not distributed:
	        sampler_cls = (
	            torch.utils.data.sampler.RandomSampler
	            if shuffle
	            else torch.utils.data.sampler.SequentialSampler
	        )
	        return sampler_cls(dataset)
	    return samplers.DistributedSampler(
	        dataset,
	        shuffle=shuffle,
	        num_replicas=num_replicas,
	        rank=rank,
	        use_random=use_random_seed,
	    )


	def _quantize(x, bins):
	    """Map each value of ``x`` to the index of the bin it falls into.

	    ``bins`` holds the bin edges in any order; it is copied and sorted, so the
	    caller's sequence is left untouched. A value equal to an edge is placed
	    in the bin to the right of that edge.
	    """
	    edges = sorted(copy.copy(bins))
	    return [bisect.bisect_right(edges, value) for value in x]


	def _compute_aspect_ratios(dataset):
	    """Return the height / width ratio of every image, in index order.

	    Sizes come from ``dataset.get_img_info(i)``, which must provide
	    ``"height"`` and ``"width"`` entries.
	    """
	    infos = (dataset.get_img_info(index) for index in range(len(dataset)))
	    return [float(info["height"]) / float(info["width"]) for info in infos]


	def make_batch_data_sampler(
	    dataset, sampler, aspect_grouping, images_per_batch, num_iters=None, start_iter=0, drop_last=False
	):
	    """Wrap ``sampler`` so that it yields batches of ``images_per_batch`` indices.

	    If ``aspect_grouping`` is non-empty it is used as the aspect-ratio bin
	    edges (a bare value is treated as a single edge) and batches are drawn
	    per bin through ``GroupedBatchSampler``; otherwise a plain torch
	    ``BatchSampler`` is used. If ``num_iters`` is given, the result is wrapped
	    in ``IterationBasedBatchSampler`` together with ``start_iter``.
	    """
	    if not aspect_grouping:
	        batches = torch.utils.data.sampler.BatchSampler(
	            sampler, images_per_batch, drop_last=drop_last
	        )
	    else:
	        if isinstance(aspect_grouping, (list, tuple)):
	            bin_edges = aspect_grouping
	        else:
	            bin_edges = [aspect_grouping]
	        ratios = _compute_aspect_ratios(dataset)
	        bin_ids = _quantize(ratios, bin_edges)
	        batches = samplers.GroupedBatchSampler(
	            sampler, bin_ids, images_per_batch, drop_uneven=drop_last
	        )

	    if num_iters is None:
	        return batches
	    return samplers.IterationBasedBatchSampler(batches, num_iters, start_iter)

	def make_data_loader(cfg, is_train=True, is_distributed=False, num_replicas=None, rank=None, start_iter=0):
	    """Build the data loader(s) described by ``cfg``.

	    Arguments:
	        cfg: configuration node; reads the SOLVER, TEST, DATALOADER, DATASETS
	            and MODEL sections plus PATHS_CATALOG.
	        is_train (bool): build training loaders (shuffled, iteration based)
	            rather than test loaders.
	        is_distributed (bool): use a distributed sampler; in test mode this
	            also switches shuffling on.
	        num_replicas (int, optional): number of participating processes;
	            falls back to ``get_world_size()`` when falsy.
	        rank (int, optional): rank forwarded to the distributed sampler.
	        start_iter (int): iteration to resume from; forced to 0 when testing.

	    Returns:
	        A single ``DataLoader`` when training with neither
	        ``DATASETS.ALTERNATIVE_TRAINING`` nor ``DATASETS.MULTISTAGE_TRAINING``
	        set; otherwise a list of ``DataLoader`` objects, one per dataset.

	    Raises:
	        AssertionError: if the configured batch size is not divisible by the
	            number of GPUs, or if plain training yields more than one loader.

	    Side effects:
	        Registers the ``DATASETS.REGISTER`` entries on the dataset catalog
	        and, while training, rewrites ``SOLVER.MAX_ITER``,
	        ``SOLVER.DATASET_LENGTH`` and ``SOLVER.MULTI_MAX_ITER`` on ``cfg``.
	    """
	    num_gpus = num_replicas or get_world_size()

	    if is_train:
	        images_per_batch = cfg.SOLVER.IMS_PER_BATCH
	        # The message is parenthesised so that both halves belong to the assert.
	        assert images_per_batch % num_gpus == 0, (
	            "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number "
	            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
	        )
	        shuffle = True
	        num_iters = cfg.SOLVER.MAX_ITER
	    else:
	        images_per_batch = cfg.TEST.IMS_PER_BATCH
	        assert images_per_batch % num_gpus == 0, (
	            "TEST.IMS_PER_BATCH ({}) must be divisible by the number "
	            "of GPUs ({}) used.".format(images_per_batch, num_gpus)
	        )
	        shuffle = bool(is_distributed)
	        num_iters = None
	        start_iter = 0
	    images_per_gpu = images_per_batch // num_gpus

	    if images_per_gpu > 1:
	        logger = logging.getLogger(__name__)
	        logger.warning(
	            "When using more than one image per GPU you may encounter "
	            "an out-of-memory (OOM) error if your GPU does not have "
	            "sufficient memory. If this happens, you can reduce "
	            "SOLVER.IMS_PER_BATCH (for training) or "
	            "TEST.IMS_PER_BATCH (for inference). For training, you must "
	            "also adjust the learning rate and schedule length according "
	            "to the linear scaling rule. See for example: "
	            "https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14"
	        )

	    # group images which have similar aspect ratio. In this case, we only
	    # group in two cases: those with width / height > 1, and the other way around,
	    # but the code supports more general grouping strategy
	    aspect_grouping = [1] if cfg.DATALOADER.ASPECT_RATIO_GROUPING else []

	    paths_catalog = import_file(
	        "maskrcnn_benchmark.config.paths_catalog", cfg.PATHS_CATALOG, True
	    )
	    DatasetCatalog = paths_catalog.DatasetCatalog

	    # Register datasets declared in the config under their suffixed names.
	    for registered_name in cfg.DATASETS.REGISTER:
	        attrs = dict(cfg.DATASETS.REGISTER[registered_name])
	        if is_train:
	            catalog_name = registered_name + cfg.DATASETS.TRAIN_DATASETNAME_SUFFIX
	        else:
	            catalog_name = registered_name + cfg.DATASETS.TEST_DATASETNAME_SUFFIX
	        DatasetCatalog.set(catalog_name, attrs)

	    dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST
	    dataset_list = _expand_bing_caption_names(dataset_list, cfg.DATASETS.BING_INDEX_LIST)
	    print("The combined datasets are: {}.".format(dataset_list))

	    # Multi-scale testing skips the transforms and uses its own collator.
	    multiscale_test = not is_train and cfg.TEST.USE_MULTISCALE
	    transforms = None if multiscale_test else build_transforms(cfg, is_train)

	    extra_args = _collect_dataset_extra_args(cfg, is_train)
	    extra_args["tokenizer"] = _build_tokenizer(cfg)

	    # An empty dataset list is treated as ungrouped instead of failing on [0].
	    if dataset_list and isinstance(dataset_list[0], (tuple, list)):
	        datasets = build_dataset_by_group(dataset_list, transforms, DatasetCatalog, is_train,
	                                          class_by_group=cfg.DATASETS.ALTERNATIVE_TRAINING,
	                                          class_concat=cfg.DATASETS.CLASS_CONCAT,
	                                          extra_args=extra_args)
	    else:
	        datasets = build_dataset(cfg, dataset_list, transforms, DatasetCatalog, is_train,
	                                 class_concat=cfg.DATASETS.CLASS_CONCAT,
	                                 extra_args=extra_args)

	    data_loaders = []
	    for di, dataset in enumerate(datasets):
	        if is_train and cfg.SOLVER.MAX_EPOCH > 0:
	            # Epoch-based schedule: derive the iteration count from the data size.
	            num_iters = cfg.SOLVER.MAX_EPOCH * len(dataset) // cfg.SOLVER.IMS_PER_BATCH
	            print("Number of iterations are {}".format(num_iters))
	            cfg.defrost()
	            cfg.SOLVER.MAX_ITER = num_iters
	            cfg.SOLVER.DATASET_LENGTH = len(dataset)
	            cfg.freeze()
	        if is_train and cfg.SOLVER.MULTI_MAX_EPOCH:
	            # Multi-stage schedule: the batch sampler is left unbounded and
	            # the per-stage iteration counts are recorded on the config.
	            num_iters = None
	            cfg.defrost()
	            cfg.SOLVER.MULTI_MAX_ITER += (cfg.SOLVER.MULTI_MAX_EPOCH[di] * len(dataset) // cfg.SOLVER.IMS_PER_BATCH,)
	            cfg.freeze()

	        if is_train and cfg.DATALOADER.DISTRIBUTE_CHUNK_AMONG_NODE:
	            sampler = _make_node_chunk_sampler(cfg, dataset, dataset_list)
	        else:
	            sampler = make_data_sampler(dataset, shuffle, is_distributed, num_replicas=num_replicas, rank=rank,
	                                        use_random_seed=cfg.DATALOADER.USE_RANDOM_SEED)
	        batch_sampler = make_batch_data_sampler(
	            dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter, drop_last=is_train
	        )
	        if multiscale_test:
	            collator = BBoxAugCollator()
	        else:
	            collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY)
	        data_loader = torch.utils.data.DataLoader(
	            dataset,
	            num_workers=cfg.DATALOADER.NUM_WORKERS,
	            batch_sampler=batch_sampler,
	            collate_fn=collator,
	        )
	        data_loaders.append(data_loader)

	    if is_train and cfg.SOLVER.MULTI_MAX_EPOCH:
	        # Final stage: sized by the smallest dataset.
	        cfg.defrost()
	        cfg.SOLVER.MULTI_MAX_ITER += (
	            cfg.SOLVER.MULTI_MAX_EPOCH[-1] * min([len(dataset) // cfg.SOLVER.IMS_PER_BATCH for dataset in datasets]),)
	        cfg.freeze()

	    if is_train and not cfg.DATASETS.ALTERNATIVE_TRAINING and not cfg.DATASETS.MULTISTAGE_TRAINING:
	        # during training, a single (possibly concatenated) data_loader is returned
	        assert len(data_loaders) == 1
	        return data_loaders[0]

	    return data_loaders


	def _expand_bing_caption_names(dataset_list, bing_index_list):
	    """Replace each bing caption placeholder by one dataset name per index."""
	    templates = (
	        ("bing_caption_train", "bing_caption_{}_train"),
	        ("bing_caption_train_no_coco", "bing_caption_{}_train_no_coco"),
	    )
	    for placeholder, template in templates:
	        if placeholder in dataset_list and len(bing_index_list) > 0:
	            names = list(dataset_list)
	            names.remove(placeholder)
	            names.extend(template.format(bing_index) for bing_index in bing_index_list)
	            dataset_list = tuple(names)
	    return dataset_list


	def _collect_dataset_extra_args(cfg, is_train):
	    """Translate config options into extra dataset constructor keyword arguments."""
	    extra_args = {}

	    if is_train:
	        if cfg.DATASETS.USE_CROWD:
	            extra_args['ignore_crowd'] = False
	        if cfg.DATASETS.MAX_BOX > 0:
	            extra_args['max_box'] = cfg.DATASETS.MAX_BOX
	        if cfg.DATASETS.FEW_SHOT > 0:
	            extra_args['few_shot'] = cfg.DATASETS.FEW_SHOT
	        if cfg.DATASETS.SHUFFLE_SEED != 0:
	            extra_args['shuffle_seed'] = cfg.DATASETS.SHUFFLE_SEED

	        # od to grounding
	        if cfg.DATASETS.RANDOM_SAMPLE_NEG > 0:
	            extra_args['random_sample_negative'] = cfg.DATASETS.RANDOM_SAMPLE_NEG
	        if cfg.DATASETS.ADD_DET_PROMPT:
	            extra_args["add_detection_prompt"] = True
	        if cfg.DATASETS.USE_OD_AUG:
	            extra_args["use_od_data_aug"] = True
	        if cfg.DATASETS.DISABLE_SHUFFLE:
	            extra_args["disable_shuffle"] = True
	        if len(cfg.DATASETS.PROMPT_VERSION) > 0:
	            extra_args["prompt_engineer_version"] = cfg.DATASETS.PROMPT_VERSION
	        if len(cfg.DATASETS.CONTROL_PROB) == 4:
	            extra_args["control_probabilities"] = cfg.DATASETS.CONTROL_PROB
	        if cfg.DATASETS.DISABLE_CLIP_TO_IMAGE:
	            extra_args["disable_clip_to_image"] = cfg.DATASETS.DISABLE_CLIP_TO_IMAGE
	        if cfg.DATASETS.NO_MINUS_ONE_FOR_ONE_HOT:
	            extra_args["no_minus_one_for_one_hot"] = cfg.DATASETS.NO_MINUS_ONE_FOR_ONE_HOT
	        extra_args["separation_tokens"] = cfg.DATASETS.SEPARATION_TOKENS

	        # caption
	        if cfg.DATASETS.CAPTION_MIN_BOX > 0:
	            extra_args["caption_min_box"] = cfg.DATASETS.CAPTION_MIN_BOX
	        if cfg.DATASETS.REPLACE_CLEAN_LABEL:
	            extra_args["replace_clean_label"] = True
	        if cfg.DATASETS.FURTHER_SCREEN:
	            extra_args["further_screen"] = True
	        if cfg.DATASETS.CAPTION_CONF > 0.0:
	            extra_args["caption_conf"] = cfg.DATASETS.CAPTION_CONF
	        extra_args["caption_nms"] = cfg.DATASETS.CAPTION_NMS
	        if cfg.DATASETS.PACK_RANDOM_CAPTION_NUMBER > 0:
	            extra_args["pack_random_caption_number"] = cfg.DATASETS.PACK_RANDOM_CAPTION_NUMBER
	        if cfg.DATASETS.INFERENCE_CAPTION:
	            extra_args["inference_caption"] = True
	        if cfg.DATASETS.SAMPLE_NEGATIVE_FOR_GROUNDING_DATA > 0:
	            extra_args["sample_negative_for_grounding_data"] = cfg.DATASETS.SAMPLE_NEGATIVE_FOR_GROUNDING_DATA
	        if cfg.DATASETS.RANDOM_PACK_PROB > 0:
	            extra_args["random_pack_prob"] = cfg.DATASETS.RANDOM_PACK_PROB
	        if cfg.DATASETS.NO_RANDOM_PACK_PROBABILITY > 0:
	            extra_args["no_random_pack_probability"] = cfg.DATASETS.NO_RANDOM_PACK_PROBABILITY
	        extra_args["safeguard_positive_caption"] = cfg.DATASETS.SAFEGUARD_POSITIVE_CAPTION
	        extra_args["local_debug"] = cfg.DATASETS.LOCAL_DEBUG
	        extra_args["no_mask_for_od"] = cfg.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_OD
	        extra_args["no_mask_for_gold"] = cfg.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_GOLD
	        extra_args["mlm_obj_for_only_positive"] = cfg.MODEL.DYHEAD.FUSE_CONFIG.MLM_OBJ_FOR_ONLY_POSITIVE
	        extra_args["caption_format_version"] = cfg.DATASETS.CAPTION_FORMAT_VERSION
	        extra_args["special_safeguard_for_coco_grounding"] = cfg.DATASETS.SPECIAL_SAFEGUARD_FOR_COCO_GROUNDING
	        extra_args["diver_box_for_vqa"] = cfg.DATASETS.DIVER_BOX_FOR_VQA

	    # The remaining options apply to training and testing alike.
	    if cfg.DATASETS.ONE_HOT:
	        extra_args["one_hot"] = True
	    if cfg.DATASETS.OVERRIDE_CATEGORY and cfg.DATASETS.USE_OVERRIDE_CATEGORY:
	        extra_args["override_category"] = cfg.DATASETS.OVERRIDE_CATEGORY
	    extra_args["caption_prompt"] = cfg.DATASETS.CAPTION_PROMPT
	    extra_args["use_caption_prompt"] = cfg.DATASETS.USE_CAPTION_PROMPT
	    return extra_args


	def _build_tokenizer(cfg):
	    """Load the tokenizer named by ``MODEL.LANGUAGE_BACKBONE.TOKENIZER_TYPE``."""
	    tokenizer_type = cfg.MODEL.LANGUAGE_BACKBONE.TOKENIZER_TYPE
	    if tokenizer_type != "clip":
	        return AutoTokenizer.from_pretrained(tokenizer_type)

	    from transformers import CLIPTokenizerFast
	    clip_kwargs = {"from_slow": True}
	    if cfg.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS:
	        # NOTE(review): presumably an existing vocabulary entry reused as the
	        # mask token for the MLM loss - confirm before changing this literal.
	        clip_kwargs["mask_token"] = 'ðŁĴĳ</w>'
	    return CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32", **clip_kwargs)


	def _make_node_chunk_sampler(cfg, dataset, dataset_list):
	    """Build the sampler that distributes "bing_caption" datasets in chunks per node."""
	    from .datasets.custom_distributed_sampler import DistributedSamplerChunkByNode

	    # One flag per dataset name: only "bing_caption" datasets are chunked.
	    chunk_or_not = ["bing_caption" in name for name in dataset_list]
	    assert len(chunk_or_not) == len(dataset.datasets)

	    # Example layout: training on 4 nodes, each with 8 GPUs.
	    num_nodes = int(os.getenv('NODE_COUNT', os.getenv('OMPI_COMM_WORLD_SIZE', 1)))
	    node_rank = int(os.getenv('NODE_RANK', os.getenv('OMPI_COMM_WORLD_RANK', 0)))
	    return DistributedSamplerChunkByNode(
	        dataset=dataset,
	        all_datasets=dataset.datasets,  # assumes dataset is a ConcatDataset instance
	        chunk_or_not=chunk_or_not,
	        num_replicas=cfg.num_gpus,  # total GPU number, e.g., 32
	        rank=dist.get_rank(),  # global rank, e.g., 0~31
	        node_rank=node_rank,  # node rank, e.g., 0~3
	        node_number=num_nodes,  # how many nodes, e.g., 4
	        process_num_per_node=cfg.num_gpus // num_nodes,  # e.g., 8
	        rank_within_local_node=cfg.local_rank,  # e.g., 0~7
	    )