import os
import random
import torch
import torchvision.transforms as transforms
from PIL import Image


def recalculate_box_and_verify_if_valid(x, y, w, h, image_size, original_image_size, min_box_size):
    """Map a box given in original-image pixels through the Resize + CenterCrop
    applied to the image, and reject it if its cropped area falls below
    min_box_size (a fraction of the final image area)."""
    # The shorter side of the original (width, height) image is resized to
    # image_size, then the result is center-cropped to image_size x image_size.
    scale = image_size / min(original_image_size)
    crop_y = (original_image_size[1] * scale - image_size) // 2
    crop_x = (original_image_size[0] * scale - image_size) // 2
    # Scale and shift the box, clamping it to the cropped region.
    x0 = max(x * scale - crop_x, 0)
    y0 = max(y * scale - crop_y, 0)
    x1 = min((x + w) * scale - crop_x, image_size)
    y1 = min((y + h) * scale - crop_y, image_size)
    # Discard boxes that end up too small after resizing and cropping.
    if (x1 - x0) * (y1 - y0) / (image_size * image_size) < min_box_size:
        return False, (None, None, None, None)
    return True, (x0, y0, x1, y1)
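
# Illustrative example: with image_size=512 and an original image of size
# (640, 480), scale = 512 / 480 ≈ 1.0667, crop_x = 85, crop_y = 0, so a box
# (x=100, y=50, w=200, h=150) maps to roughly (21.7, 53.3, 235.0, 213.3),
# which covers about 13% of the cropped image and is therefore kept.
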
class COCODataset(torch.utils.data.Dataset):
    """COCO-style dataset yielding an image tensor, up to max_boxes_per_data
    normalized boxes with per-box text embeddings and a validity mask, and a
    tokenized caption (a tokenizer must be supplied)."""
    def __init__(
        self,
        data_path,
        image_path,
        image_size=512,
        min_box_size=0.01,
        max_boxes_per_data=8,
        tokenizer=None,
    ):
        super().__init__()
        self.min_box_size = min_box_size
        self.max_boxes_per_data = max_boxes_per_data
        self.image_size = image_size
        self.image_path = image_path
        self.tokenizer = tokenizer
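        # Resize the shorter side to image_size and center-crop to a square; the
        # helper recalculate_box_and_verify_if_valid replays this same geometry
        # on the box coordinates.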
        self.transforms = transforms.Compose(
            [
                transforms.Resize(image_size, interpolation=transforms.InterpolationMode.BILINEAR),
                transforms.CenterCrop(image_size),
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
            ]
        )
        # Preprocessed list of records, each with "file_path", "annos", and "captions".
        self.data_list = torch.load(data_path, map_location="cpu")

    def __getitem__(self, index):
        # Sanity check: an unusually large box cap is probably a configuration mistake.
        assert self.max_boxes_per_data <= 99, "Are you sure you want this many boxes per image?"
        out = {}
        data = self.data_list[index]
        image = Image.open(os.path.join(self.image_path, data["file_path"])).convert("RGB")
        original_image_size = image.size  # (width, height)
        out["pixel_values"] = self.transforms(image)
        annos = data["annos"]
        areas, valid_annos = [], []
        for anno in annos:
            # The stored bbox is unpacked as corner coordinates (x0, y0, x1, y1);
            # convert to (x, y, w, h) before replaying the resize/crop geometry.
            x0, y0, x1, y1 = anno["bbox"]
            x, y, w, h = x0, y0, x1 - x0, y1 - y0
            valid, (x0, y0, x1, y1) = recalculate_box_and_verify_if_valid(
                x, y, w, h, self.image_size, original_image_size, self.min_box_size
            )
            if valid:
                anno["bbox"] = [x0, y0, x1, y1]
                areas.append((x1 - x0) * (y1 - y0))
                valid_annos.append(anno)
        # Sort according to area and choose the largest N objects
        wanted_idxs = torch.tensor(areas).sort(descending=True)[1]
        wanted_idxs = wanted_idxs[: self.max_boxes_per_data]
        valid_annos = [valid_annos[i] for i in wanted_idxs]
        out["boxes"] = torch.zeros(self.max_boxes_per_data, 4)
        out["masks"] = torch.zeros(self.max_boxes_per_data)
        out["text_embeddings_before_projection"] = torch.zeros(self.max_boxes_per_data, 768)
        for i, anno in enumerate(valid_annos):
            out["boxes"][i] = torch.tensor(anno["bbox"]) / self.image_size
            out["masks"][i] = 1
            out["text_embeddings_before_projection"][i] = anno["text_embeddings_before_projection"]
        # Randomly drop all box conditioning for a small fraction of samples,
        # and drop the caption for half of them (conditioning dropout).
        prob_drop_boxes = 0.1
        if random.random() < prob_drop_boxes:
            out["masks"][:] = 0
        caption = random.choice(data["captions"])
        prob_drop_captions = 0.5
        if random.random() < prob_drop_captions:
            caption = ""
        caption = self.tokenizer(
            caption,
            max_length=self.tokenizer.model_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        out["caption"] = caption
        return out

    def __len__(self):
        return len(self.data_list)
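

# Minimal usage sketch (illustrative, with assumed paths and tokenizer): the
# preprocessed data file is expected to hold a list of dicts with "file_path",
# "annos", and "captions" keys, as consumed by __getitem__ above.
if __name__ == "__main__":
    from transformers import CLIPTokenizer

    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    dataset = COCODataset(
        data_path="data/coco_preprocessed.pth",  # hypothetical preprocessed file
        image_path="data/coco/train2017",        # hypothetical image directory
        image_size=512,
        tokenizer=tokenizer,
    )
    sample = dataset[0]
    print(sample["pixel_values"].shape)  # torch.Size([3, 512, 512])
    print(sample["boxes"].shape)         # torch.Size([8, 4])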
