Spaces:

zdou0830
/

desco

Sleeping

File size: 16,028 Bytes

749745d

import torch
import torch.distributed as dist
import time
from torchvision.ops import nms
import random
import numpy as np
from PIL import Image, ImageDraw
import pdb
from maskrcnn_benchmark.structures.bounding_box import BoxList
from .modulated_coco import ConvertCocoPolysToMask
from .tsv import ODTSVDataset, TSVYamlDataset
from .od_to_grounding import sanity_check_target_after_processing
from maskrcnn_benchmark.data.datasets._caption_aug import CaptionAugmentation
from collections import defaultdict

class CaptionTSV(TSVYamlDataset):
    def __init__(

        self,

        yaml_file,

        transforms,

        return_tokens,

        return_masks,

        tokenizer,

        caption_min_box=1,

        replace_clean_label=False,

        further_screen=False,

        caption_conf=0.5,

        caption_nms=-1,

        pack_random_caption_number=0,

        inference_caption=False,

        sample_negative_for_grounding_data=-1,

        random_pack_prob=-1.0,

        no_random_pack_probability=0.0,

        safeguard_positive_caption=True,

        mlm_obj_for_only_positive=False,

        caption_format_version="v1",

        local_debug=False,

        max_query_len=256,

        cc_caption_augmentation_version=None,

        caption_vocab_file=None,

        **kwargs

    ):
        super(CaptionTSV, self).__init__(yaml_file, None, replace_clean_label)
        self.yaml_file = yaml_file
        self._transforms = transforms
        self.max_query_len = 225
        self.prepare = ConvertCocoPolysToMask(
            return_masks=return_masks, return_tokens=return_tokens, tokenizer=tokenizer, max_query_len=max_query_len
        )
        self.tokenizer = tokenizer
        self.caption_min_box = caption_min_box
        self.replace_clean_label = replace_clean_label
        self.further_screen = further_screen
        self.pack_random_caption_number = pack_random_caption_number
        self.caption_format_version = caption_format_version

        self.caption_conf = caption_conf
        self.caption_nms = caption_nms
        self.inference_caption = inference_caption
        self.sample_negative_for_grounding_data = sample_negative_for_grounding_data
        self.random_pack_prob = random_pack_prob
        self.no_random_pack_probability = no_random_pack_probability
        self.safeguard_positive_caption = safeguard_positive_caption
        self.mlm_obj_for_only_positive = mlm_obj_for_only_positive
        try:
            self.rank = dist.get_rank()
        except:
            self.rank = 0
        self.caption_augmentation_version = cc_caption_augmentation_version
        if self.caption_augmentation_version is not None:
            self.caption_augmentation = CaptionAugmentation(
                self.caption_augmentation_version,
                tokenizer,
                caption_vocab_file=caption_vocab_file
            )

    def __len__(self):
        return super(CaptionTSV, self).__len__()

    def pack_caption(self, positive_caption, negative_captions, original_tokens_positive):
        if len(negative_captions) == 0:
            return positive_caption, original_tokens_positive, [(0, len(positive_caption))]
        if self.safeguard_positive_caption:
            length_of_each_caption = []
            for caption in negative_captions + [positive_caption]:
                tokenized = self.tokenizer(caption, return_tensors="pt")
                length_of_each_caption.append(tokenized.input_ids.size(-1))
            max_length = self.max_query_len - length_of_each_caption[-1]
            indexes = list(range(len(negative_captions)))
            random.shuffle(indexes)
            new_caption_list = [positive_caption]
            for i in indexes:
                if length_of_each_caption[i] < max_length:
                    new_caption_list.append(negative_captions[i])
                    max_length -= length_of_each_caption[i]
        else:
            new_caption_list = [positive_caption] + negative_captions
        random.shuffle(new_caption_list)

        new_caption = ""

        for i in new_caption_list:
            if i == positive_caption:
                start_position = len(new_caption)
            new_caption += i
            if not i.endswith("."):
                new_caption += "."
            new_caption += " "

        # shift the token positions the boxes are aligned to
        for index, i in enumerate(original_tokens_positive):
            original_tokens_positive[index] = [tuple(j) for j in i]
        for i in original_tokens_positive:
            for index, j in enumerate(i):
                i[index] = (j[0] + start_position, j[1] + start_position)

        return new_caption, original_tokens_positive, [(start_position, start_position + len(positive_caption))]

    def __get_negative_captions__(self, idx, negative_size=7):
        negative_captions = []
        for i in range(negative_size):
            img, anno, _, scale = super(CaptionTSV, self).__getitem__(np.random.choice(len(self)))
            caption = anno["caption"]
            negative_captions.append(caption)

        return negative_captions

    def target_transpose_in(self, anno):
            # for the target from "caption", we need to transpose to box format
            new_target = []
            for box in range(len(anno["bboxes"])):
                new_box = {}
                new_box["tokens_positive"] = anno["tokens_positive"][box]
                new_box["nouns"] = anno["all_nounds_in_vocab"][box]
                new_box["bbox"] = anno["bboxes"][box]
                new_target.append(new_box)
            return new_target

    def target_transpose_out(self, target):
        # for the target from "caption", we need to transpose to box format
        new_target = defaultdict(list)

        for box in target:
            new_target["bboxes"].append(box["bbox"])
            new_target["tokens_positive"].append(box["tokens_positive"])
            if "spans_positive" in box:
                new_target["spans_positive"].append(box["spans_positive"])
        return new_target

    def __getitem__(self, idx):
        try:
            img, anno, _, scale = super(CaptionTSV, self).__getitem__(idx)
            if self.inference_caption:
                caption = None
                if isinstance(anno, list):
                    caption = anno[0]["caption"]  # inference mode for bing
                    anno = []
                elif len(anno) == 1:
                    caption = anno["caption"]  # inference mode for googlecc
                    anno = []
                else:
                    caption = " ".join(anno["captions"])
                    anno = []
            else:
                """

                An example

                {'img_h': 1154, 'img_w': 1600, 'caption': 'xxx', 'tokens_positive': [[[47, 50], [51, 53], [54, 59]], [[32, 35], [36, 41]], [[32, 35], [36, 41]], [[0, 3], [3, 6], [6, 10], [11, 16], [17, 19], [20, 23]], [[32, 35], [36, 41]], [[32, 35], [36, 41]]], 'bboxes': [[7.344961166381836, 10.479412078857422, 1592.2679443359375, 1090.0028076171875], [950.32861328125, 346.572021484375, 1333.2373046875, 679.3215942382812], [927.44140625, 342.7712707519531, 1389.833984375, 719.5758666992188], [90.48786163330078, 363.67572021484375, 1381.8631591796875, 1078.687744140625], [122.84217071533203, 422.6786193847656, 507.845703125, 667.2651977539062], [80.62384033203125, 416.500244140625, 563.1666259765625, 734.603271484375]], 'scores': [0.7966700196266174, 0.8952182531356812, 0.8186006546020508, 0.9995516538619995, 0.8021856546401978, 0.8923134803771973]}

                """
                if len(anno["bboxes"]) < self.caption_min_box:  # Retry triggered!
                    return self[np.random.choice(len(self))]
                if self.caption_format_version == "v2":
                    anno = self.convert_anno_from_v2_to_v1(anno)

                if self.further_screen:
                    conf = self.caption_conf
                    nms_thre = self.caption_nms

                    bboxes = torch.as_tensor(anno["bboxes"]).float()
                    scores = torch.as_tensor(anno["scores"])
                    tokens_positive = anno["tokens_positive"]
                    if "all_nounds_in_vocab" in anno:
                        all_nounds_in_vocab = anno["all_nounds_in_vocab"]
                    else:
                        all_nounds_in_vocab = []
                    # print("\n\n\n\n tokens_positive in original data", tokens_positive)

                    keep = scores > conf
                    scores = scores[keep]
                    bboxes = bboxes[keep]
                    tokens_positive = [i for index, i in enumerate(tokens_positive) if keep[index]]
                    all_nounds_in_vocab = [i for index, i in enumerate(all_nounds_in_vocab) if keep[index]]

                    assert len(tokens_positive) == len(bboxes) == len(scores)

                    if len(bboxes) < self.caption_min_box:  # Retry triggered!
                        return self[np.random.choice(len(self))]

                    if nms_thre > 0:
                        keep = nms(boxes=bboxes, scores=scores, iou_threshold=nms_thre)
                        scores = scores[keep]
                        bboxes = bboxes[keep]
                        tokens_positive = [tokens_positive[i] for i in keep]
                        assert len(tokens_positive) == len(bboxes) == len(scores)

                    # Write back
                    anno["bboxes"] = bboxes.tolist()
                    anno["scores"] = scores.tolist()
                    anno["tokens_positive"] = tokens_positive
                    anno["all_nounds_in_vocab"] = all_nounds_in_vocab

                if len(anno["bboxes"]) < self.caption_min_box:  # Retry triggered!
                    return self[np.random.choice(len(self))]

                if self.caption_augmentation_version is not None:
                    caption, new_anno, spans = self.caption_augmentation(
                        anno["caption"],
                        self.target_transpose_in(anno),
                        gpt3_outputs = anno.get("gpt3_outputs", None))
                    anno.update(self.target_transpose_out(new_anno))
                    anno["caption"] = caption
                    do_neg_aug = False
                else:
                    do_neg_aug = True
                    spans = None

                boxes = torch.as_tensor(anno["bboxes"])
                caption = anno["caption"]
                target = BoxList(boxes, (anno["img_w"], anno["img_h"]), mode="xyxy")
                target = target.clip_to_image(remove_empty=True)
                if spans is not None:
                    target.add_field("spans", spans) # add spans to target
                #pdb.set_trace()
                # print("original caption", caption)
                empty_everything = False
                if self.sample_negative_for_grounding_data != -1:
                    if random.random() < self.sample_negative_for_grounding_data:
                        empty_everything = True

                if empty_everything:
                    caption = self.__get_negative_captions__(idx, negative_size=1)[0]

                if self.pack_random_caption_number != 0 and do_neg_aug:
                    if self.random_pack_prob != -1.0:
                        if random.random() < self.no_random_pack_probability:
                            negative_pack_number = 0
                        elif random.random() < self.random_pack_prob:
                            negative_pack_number = self.pack_random_caption_number
                        else:
                            negative_pack_number = np.random.choice(self.pack_random_caption_number)
                    else:
                        negative_pack_number = self.pack_random_caption_number

                    negative_captions = self.__get_negative_captions__(idx, negative_size=negative_pack_number)

                    caption, anno["tokens_positive"], greenlight_span_for_masked_lm_objective = self.pack_caption(
                        caption, negative_captions, anno["tokens_positive"]
                    )
                else:
                    greenlight_span_for_masked_lm_objective = [(0, len(caption))]

                if not self.mlm_obj_for_only_positive:
                    greenlight_span_for_masked_lm_objective = [(0, len(caption))]

                new_anno = []
                areas = target.area()
                for i in range(len(target)):
                    new_anno_i = {}
                    new_anno_i["area"] = areas[i]
                    new_anno_i["iscrowd"] = 0
                    new_anno_i["image_id"] = idx
                    new_anno_i["category_id"] = 1  # following vg and others
                    new_anno_i["id"] = None
                    new_anno_i["bbox"] = target.bbox[i].numpy().tolist()
                    new_anno_i["tokens_positive"] = anno["tokens_positive"][i]
                    if "spans_positive" in anno:
                        new_anno_i["spans_positive"] = anno["spans_positive"][i]
                    new_anno.append(new_anno_i)

                # except:
                #     return self[np.random.choice(len(self))]

                anno = new_anno
                if empty_everything:
                    anno = []

            annotations = {"image_id": idx, "annotations": anno, "caption": caption}
            annotations["greenlight_span_for_masked_lm_objective"] = greenlight_span_for_masked_lm_objective
            if "spans" in target.extra_fields:
                annotations["spans"] = target.extra_fields["spans"]
                if not isinstance(annotations["spans"], list):
                    annotations["spans"] = annotations["spans"].tolist()
            img, annotations = self.prepare(img, annotations, box_format="xyxy")
            if self._transforms is not None:
                img, target = self._transforms(img, target)

            # add additional property
            for ann in annotations:
                target.add_field(ann, annotations[ann])
        except:
            print("Outter Retry triggered!!")
            return self[np.random.choice(len(self))]

        return img, target, idx

    def convert_anno_from_v2_to_v1(self, anno):
        flatterned_bboxes = []
        flatterned_tokens_positive = []
        flatterned_bboxes_scores = []
        flatterned_nouns = []
        for i in range(len(anno["bboxes"])):
            # i is the index for entity
            for j in range(len(anno["bboxes"][i])):
                # j is the index for each box
                flatterned_bboxes.append(anno["bboxes"][i][j])
                flatterned_tokens_positive.append(
                    anno["tokens_positive"][i]
                )  # Assume this box corresponds to all the token_spans for this entity
                if "all_nounds_in_vocab" in anno:
                    flatterned_nouns.append(anno["all_nounds_in_vocab"][i])
                flatterned_bboxes_scores.append(anno["scores"][i][j])
        anno["bboxes"] = flatterned_bboxes
        anno["tokens_positive"] = flatterned_tokens_positive
        anno["scores"] = flatterned_bboxes_scores
        if "all_nounds_in_vocab" in anno:
            anno["all_nounds_in_vocab"] = flatterned_nouns
        return anno

    def get_raw_image(self, idx):
        image, *_ = super(CaptionTSV, self).__getitem__(idx)
        return image

    def get_img_id(self, idx):
        line_no = self.get_line_no(idx)
        if self.label_tsv is not None:
            row = self.label_tsv.seek(line_no)
            img_id = row[0]
            return img_id