File size: 7,547 Bytes
032e687
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
from typing import Optional, Dict, Union, Tuple, List
from PIL import Image
import mmengine.fileio as fileio
from mmengine.logging import print_log
import io
from mmcv.transforms import LoadImageFromFile, BaseTransform
from xtuner.registry import BUILDER
from xtuner.utils.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
import torch
import torch.nn.functional as F
import copy


class PILLoadImageFromFile(LoadImageFromFile):
    """Variant of ``LoadImageFromFile`` that stores a ``PIL.Image`` object.

    The raw bytes are fetched through ``mmengine.fileio`` and handed to
    ``PIL.Image.open``; the resulting image object is what ends up in
    ``results['img']``.
    """

    def __init__(self, **kwargs):
        # ``backend_args`` is taken out of ``kwargs`` and forwarded explicitly
        # (``None`` when the caller did not supply it).
        explicit_backend_args = kwargs.pop('backend_args', None)
        super().__init__(**kwargs, backend_args=explicit_backend_args)

    def transform(self, results: dict) -> Optional[dict]:
        """Read ``results['img_path']`` and attach the opened image.

        Args:
            results (dict): Sample dict; must contain the key ``'img_path'``.

        Returns:
            dict | None: ``results`` with ``'img'``, ``'img_shape'`` and
            ``'ori_shape'`` filled in (both shapes are ``(height, width)``),
            or ``None`` when loading raised and ``self.ignore_empty`` is set.

        Raises:
            Exception: Whatever the fetch / open step raised, re-raised
                unchanged when ``self.ignore_empty`` is falsy.
        """
        path = results['img_path']
        try:
            if self.file_client_args is None:
                raw_bytes = fileio.get(path, backend_args=self.backend_args)
            else:
                # Legacy route: resolve a file client from the client args.
                client = fileio.FileClient.infer_client(
                    self.file_client_args, path)
                raw_bytes = client.get(path)
            pil_image = Image.open(io.BytesIO(raw_bytes))
        except Exception as e:
            if not self.ignore_empty:
                raise e
            return None

        # in some cases, images are not read successfully, the img would be
        # `None`, refer to https://github.com/open-mmlab/mmpretrain/issues/1427
        assert pil_image is not None, f'failed to load image: {path}'

        results['img'] = pil_image
        results['img_shape'] = (pil_image.height, pil_image.width)
        results['ori_shape'] = (pil_image.height, pil_image.width)
        return results


class RefCOCO2PNG(BaseTransform):
    """Turn a sample with text expressions and masks into model inputs.

    Each entry of ``results['text']`` is tokenized and appended after a fixed,
    pre-tokenized prompt; every token is tagged with the index of the
    expression it came from (``-1`` for prompt and separator tokens). The
    image is run through the image processor and the ground-truth masks are
    resized and padded according to the processor's meta data.

    Args:
        image_processor: Config handed to ``BUILDER.build``. The built object
            must offer ``preprocess(image)`` returning a mapping with the
            keys ``'pixel_values'``, ``'meta_datas'`` and ``'image_sizes'``.
        tokenizer: Config handed to ``BUILDER.build``. The built object is
            used through ``encode``, ``decode`` and ``add_special_tokens``.
        prompt_template: Mapping whose ``'INSTRUCTION'`` entry is a format
            string taking an ``input`` field. It is indexed unconditionally,
            so the ``None`` default is not usable as-is.
        prompt (str): Text substituted for ``input`` in the template.
        concat (bool): When true, ``transform`` returns one dict covering all
            expressions; otherwise a list with one dict per expression.
        image2tensor (bool): When true, the processor's pixel values are
            converted with ``torch.from_numpy``.
        add_image_token (bool): When true, ``image_token`` is registered as an
            additional special token and its id inside ``input_ids`` is
            replaced by ``IMAGE_TOKEN_INDEX``.
        image_token (str): Textual image placeholder token.
    """

    def __init__(self,
                 image_processor=None,
                 tokenizer=None,
                 prompt_template=None,
                 prompt='<image>\nWhat is shown in this image?',
                 concat=True,
                 image2tensor=True,
                 add_image_token=False,
                 image_token=DEFAULT_IMAGE_TOKEN):
        self.tokenizer = BUILDER.build(tokenizer)
        self.image_processor = BUILDER.build(image_processor)

        self.concat = concat
        self.image2tensor = image2tensor
        self.image_token = image_token
        self.add_image_token = add_image_token

        if self.add_image_token:
            print_log(f"Manually add image token: {self.image_token}")
            num_added_toks = self.tokenizer.add_special_tokens(
                {'additional_special_tokens': [self.image_token, ]})
            # Exactly one new token is expected to be registered.
            assert num_added_toks == 1

        # The last id produced for the image token is taken as its index.
        image_token_ids = self.tokenizer.encode(
            self.image_token, add_special_tokens=False)
        self.image_token_idx = image_token_ids[-1]
        print_log(f"Image token: {self.tokenizer.decode(self.image_token_idx)}")

        # The prompt is tokenized once here and reused for every sample.
        instruction = prompt_template['INSTRUCTION'].format(input=prompt)
        self.prompt = self.tokenizer.encode(instruction,
                                            add_special_tokens=True)
        self.prompt_template = prompt_template

    def transform(self, results):
        """Dispatch to the concatenated or the per-expression variant."""
        handler = self.transform_concat if self.concat else self.transform_split
        return handler(results)

    def transform_split(self, results):
        """Run ``transform_concat`` once per expression.

        Every expression gets its own deep copy of ``results`` in which
        ``'text'`` holds just that expression and ``'gt_masks'`` is the
        matching one-element slice of the original masks.

        Returns:
            list: One ``transform_concat`` output per entry of
            ``results['text']``, in the same order.
        """
        per_expression = []
        for idx, phrase in enumerate(results['text']):
            single = copy.deepcopy(results)
            single.update(text=[phrase],
                          gt_masks=results['gt_masks'][idx:idx + 1])
            per_expression.append(self.transform_concat(single))
        return per_expression

    def transform_concat(self, results: dict):
        """Build the tensors for all expressions of one sample.

        Args:
            results (dict): Needs ``'text'`` (sequence of strings), ``'img'``
                (passed to the image processor) and ``'gt_masks'`` (object
                whose ``.masks`` is a NumPy array with one mask per
                expression).

        Returns:
            dict: ``input_ids``, ``mask_ids``, ``pixel_values``,
            ``padded_masks``, ``masks``, ``gt_masks``, ``image_sizes``,
            ``image``, ``meta_data`` and ``labels``.
        """
        expressions = results['text']
        prompt_len = len(self.prompt)

        # --- text: prompt tokens followed by "<expression> ." segments -----
        separator_id = self.tokenizer.encode('.', add_special_tokens=False)[-1]
        caption_ids = []
        owner_ids = [-1] * prompt_len  # prompt tokens belong to no expression
        for idx, phrase in enumerate(expressions):
            phrase_ids = self.tokenizer.encode(phrase, add_special_tokens=False)
            caption_ids.extend(phrase_ids)
            owner_ids.extend([idx] * len(phrase_ids))
            # The separator after each expression is tagged with -1 as well.
            caption_ids.append(separator_id)
            owner_ids.append(-1)

        input_ids = torch.tensor(self.prompt + caption_ids, dtype=torch.long)
        mask_ids = torch.tensor(owner_ids)

        # --- image ----------------------------------------------------------
        image = results['img']
        image_data = self.image_processor.preprocess(image)
        pixel_values = image_data['pixel_values'][0]
        if self.image2tensor:
            pixel_values = torch.from_numpy(pixel_values)
        meta_data = image_data['meta_datas'][0]

        # --- masks ----------------------------------------------------------
        assert len(results['gt_masks'].masks) == len(expressions)
        mask_cnt = len(expressions)

        float_masks = torch.from_numpy(results['gt_masks'].masks).float()
        # Untouched copy of the masks before any resizing.
        gt_masks = float_masks.clone()

        image_shape = meta_data['image_shape']
        target_size = (image_shape['height'], image_shape['width'])
        # A leading dim is added for ``F.interpolate`` and removed afterwards.
        masks = F.interpolate(float_masks[None], size=target_size)[0]

        padded_shape = meta_data['padded_shape']
        p_h, p_w = padded_shape['height'], padded_shape['width']
        padding = meta_data['padding']
        row_span = slice(padding['before_height'],
                         p_h - padding['after_height'])
        col_span = slice(padding['before_width'],
                         p_w - padding['after_width'])
        # Resized masks are pasted into a zero canvas of the padded size.
        padded_masks = torch.zeros(mask_cnt, p_h, p_w, dtype=masks.dtype)
        padded_masks[:, row_span, col_span] = masks

        # --- labels: prompt positions ignored, caption positions kept -------
        labels = input_ids.clone()
        labels[:prompt_len] = IGNORE_INDEX

        # Done after the labels are taken, so labels keep the tokenizer ids.
        if self.add_image_token:
            input_ids[input_ids == self.image_token_idx] = IMAGE_TOKEN_INDEX

        return {
            'input_ids': input_ids,
            'mask_ids': mask_ids,
            'pixel_values': pixel_values,
            'padded_masks': padded_masks,
            'masks': masks,
            'gt_masks': gt_masks,
            'image_sizes': torch.tensor(image_data['image_sizes'][0]),
            'image': image,
            'meta_data': meta_data,
            'labels': labels,
        }


if __name__ == '__main__':
    # Manual smoke test: build the RefCOCO validation split with the
    # transforms defined above and print the keys of every produced sample.
    from mmdet.datasets import RefCocoDataset
    from mmengine.config import Config
    from mmdet.datasets.transforms import LoadAnnotations

    cfg = Config.fromfile('configs/fuyu/frozen_fuyu_8b_unet_sam_l_refcoco_png.py')
    template_cfg = cfg.prompt_template
    tokenizer_cfg = cfg.tokenizer
    processor_cfg = cfg.image_processor
    prompt_override = cfg.get('prompt', None)

    refcoco2png_cfg = dict(
        type=RefCOCO2PNG,
        image_processor=processor_cfg,
        tokenizer=tokenizer_cfg,
        prompt_template=template_cfg,
    )
    # Only override the transform's default prompt when the config has one.
    if prompt_override is not None:
        refcoco2png_cfg['prompt'] = prompt_override

    load_image_cfg = dict(type=PILLoadImageFromFile, backend_args=None)
    load_annotations_cfg = dict(
        type=LoadAnnotations,
        with_mask=True,
        with_bbox=False,
        with_seg=False,
        with_label=False)
    demo_pipeline = [load_image_cfg, load_annotations_cfg, refcoco2png_cfg]

    demo_dataset = RefCocoDataset(
        data_root='data/coco/',
        data_prefix=dict(img_path='train2014/'),
        text_mode='select_first',
        pipeline=demo_pipeline,
        ann_file='refcoco/instances.json',
        split_file='refcoco/refs(unc).p',
        split='val',
    )

    for sample in demo_dataset:
        print(sample.keys())