Spaces:
Sleeping
Sleeping
import os | |
import os.path | |
from pathlib import Path | |
from typing import Any, Callable, Optional, Tuple | |
import torch | |
from maskrcnn_benchmark.structures.bounding_box import BoxList | |
import pdb | |
from PIL import Image, ImageDraw | |
from torchvision.datasets.vision import VisionDataset | |
from .modulated_coco import ConvertCocoPolysToMask, has_valid_annotation | |
from maskrcnn_benchmark.data.datasets._caption_aug import CaptionAugmentation | |
import numpy as np | |
class CustomCocoDetection(VisionDataset): | |
"""Coco-style dataset imported from TorchVision. | |
It is modified to handle several image sources | |
Args: | |
root_coco (string): Path to the coco images | |
root_vg (string): Path to the vg images | |
annFile (string): Path to json annotation file. | |
transform (callable, optional): A function/transform that takes in an PIL image | |
and returns a transformed version. E.g, ``transforms.ToTensor`` | |
target_transform (callable, optional): A function/transform that takes in the | |
target and transforms it. | |
transforms (callable, optional): A function/transform that takes input sample and its target as entry | |
and returns a transformed version. | |
""" | |
def __init__( | |
self, | |
root_coco: str, | |
root_vg: str, | |
annFile: str, | |
transform: Optional[Callable] = None, | |
target_transform: Optional[Callable] = None, | |
transforms: Optional[Callable] = None, | |
) -> None: | |
super(CustomCocoDetection, self).__init__(root_coco, transforms, transform, target_transform) | |
from pycocotools.coco import COCO | |
self.coco = COCO(annFile) | |
self.ids = list(sorted(self.coco.imgs.keys())) | |
ids = [] | |
for img_id in self.ids: | |
if isinstance(img_id, str): | |
ann_ids = self.coco.getAnnIds(imgIds=[img_id], iscrowd=None) | |
else: | |
ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=None) | |
anno = self.coco.loadAnns(ann_ids) | |
if has_valid_annotation(anno): | |
ids.append(img_id) | |
self.ids = ids | |
self.root_coco = root_coco | |
self.root_vg = root_vg | |
def __getitem__(self, index): | |
""" | |
Args: | |
index (int): Index | |
Returns: | |
tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``. | |
""" | |
coco = self.coco | |
img_id = self.ids[index] | |
ann_ids = coco.getAnnIds(imgIds=img_id) | |
target = coco.loadAnns(ann_ids) | |
img_info = coco.loadImgs(img_id)[0] | |
path = img_info["file_name"] | |
dataset = img_info["data_source"] | |
cur_root = self.root_coco if dataset == "coco" else self.root_vg | |
img = Image.open(os.path.join(cur_root, path)).convert("RGB") | |
if self.transforms is not None: | |
img, target = self.transforms(img, target) | |
return img, target | |
def __len__(self): | |
return len(self.ids) | |
class MixedDataset(CustomCocoDetection): | |
"""Same as the modulated detection dataset, except with multiple img sources""" | |
def __init__( | |
self, | |
img_folder_coco, | |
img_folder_vg, | |
ann_file, | |
transforms, | |
return_masks, | |
return_tokens, | |
tokenizer=None, | |
disable_clip_to_image=False, | |
no_mask_for_gold=False, | |
max_query_len=256, | |
caption_augmentation_version=None, | |
caption_vocab_file=None, | |
**kwargs | |
): | |
super(MixedDataset, self).__init__(img_folder_coco, img_folder_vg, ann_file) | |
self._transforms = transforms | |
self.max_query_len = max_query_len | |
self.prepare = ConvertCocoPolysToMask( | |
return_masks, return_tokens, tokenizer=tokenizer, max_query_len=max_query_len | |
) | |
self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} | |
self.disable_clip_to_image = disable_clip_to_image | |
self.no_mask_for_gold = no_mask_for_gold | |
self.caption_augmentation_version = caption_augmentation_version | |
if self.caption_augmentation_version is not None: | |
self.caption_augmentation = CaptionAugmentation( | |
self.caption_augmentation_version, | |
tokenizer, | |
caption_vocab_file=caption_vocab_file | |
) | |
def __getitem__(self, idx): | |
#try: | |
img, target = super(MixedDataset, self).__getitem__(idx) | |
image_id = self.ids[idx] | |
__anno = self.coco.loadImgs(image_id)[0] | |
caption = __anno["caption"] | |
if self.caption_augmentation_version is not None: | |
caption, target, spans = self.caption_augmentation(caption, target, gpt3_outputs = __anno.get("gpt3_outputs", None)) | |
# print("augmented caption: ", caption) | |
# print("\n") | |
else: | |
spans = None | |
anno = {"image_id": image_id, "annotations": target, "caption": caption} | |
anno["greenlight_span_for_masked_lm_objective"] = [(0, len(caption))] | |
if self.no_mask_for_gold: | |
anno["greenlight_span_for_masked_lm_objective"].append((-1, -1, -1)) | |
img, anno = self.prepare(img, anno) | |
# convert to BoxList (bboxes, labels) | |
boxes = torch.as_tensor(anno["boxes"]).reshape(-1, 4) # guard against no boxes | |
target = BoxList(boxes, img.size, mode="xyxy") | |
classes = anno["labels"] | |
target.add_field("labels", classes) | |
# if spans is not None: | |
# target.add_field("spans", spans) # add spans to target | |
if not self.disable_clip_to_image: | |
num_boxes = len(boxes) | |
target = target.clip_to_image(remove_empty=True) | |
assert len(target.bbox) == num_boxes, "Box removed in MixedDataset!!!" | |
if self._transforms is not None: | |
img, target = self._transforms(img, target) | |
# add additional property | |
for ann in anno: | |
target.add_field(ann, anno[ann]) | |
return img, target, idx | |
# except: | |
# print("error in __getitem__ in mixed", idx) | |
# return self[np.random.choice(len(self))] | |
def get_img_info(self, index): | |
img_id = self.id_to_img_map[index] | |
img_data = self.coco.imgs[img_id] | |
return img_data | |