File size: 8,848 Bytes
e82dff2
 
b0c2018
3b61cce
 
 
 
 
 
45a5416
 
 
 
 
 
7617596
3b61cce
 
ad3ee60
3b61cce
 
 
a8797f1
 
 
53b7b42
a8797f1
53b7b42
ef365f5
bd0d673
3b61cce
 
 
0312353
 
3b61cce
1aa8228
 
19327c9
1aa8228
3b61cce
 
 
 
0312353
3b61cce
 
 
0312353
3b61cce
 
b0c2018
 
 
 
 
 
 
 
 
 
 
 
45a5416
 
 
 
3b61cce
0312353
45a5416
 
335898c
 
 
0312353
45a5416
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359b3f0
45a5416
 
 
 
 
 
 
 
7617596
 
 
 
 
 
 
fb94f78
7617596
6eaf487
7617596
fb94f78
 
7617596
0312353
fb14311
fb94f78
073a46b
6eaf487
 
 
 
 
 
fb94f78
6eaf487
53b7b42
6eaf487
 
fb94f78
ef365f5
 
073a46b
6eaf487
 
 
 
 
 
fb94f78
6eaf487
539688d
6eaf487
 
53b7b42
fb94f78
ef365f5
 
6eaf487
7617596
45a5416
7617596
 
 
45a5416
0312353
7617596
fb94f78
3b61cce
7617596
45a5416
ef365f5
45a5416
7617596
0312353
7617596
45a5416
 
 
 
 
 
0312353
 
7617596
 
 
 
3b61cce
ef365f5
45a5416
 
 
19327c9
 
45a5416
ef365f5
45a5416
53b7b42
19327c9
 
45a5416
 
 
 
 
 
53b7b42
a8208b6
53b7b42
 
 
 
 
 
ef365f5
45a5416
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
from __future__ import annotations

import concurrent.futures
import os
import pathlib
import shlex
import shutil
import subprocess
import sys
import hashlib
from typing import Tuple
try:
    import ruamel_yaml as yaml
except ModuleNotFoundError:
    import ruamel.yaml as yaml

import cv2
import torch
from label_prettify import label_prettify

repo_dir = pathlib.Path(__file__).parent
submodule_dir = repo_dir / 'prismer'
sys.path.insert(0, submodule_dir.as_posix())

from dataset import create_dataset, create_loader
from dataset.utils import pre_question
from model.prismer_caption import PrismerCaption
from model.prismer_vqa import PrismerVQA
from model.modules.utils import interpolate_pos_embed


def download_models() -> None:
    if not pathlib.Path('prismer/experts/expert_weights/').exists():
        subprocess.run(shlex.split('python download_checkpoints.py --download_experts=True'), cwd='prismer')

    model_names = [
        'vqa_prismer_base',
        'vqa_prismer_large',
        'pretrain_prismer_base',
        'pretrain_prismer_large',
    ]
    for model_name in model_names:
        if pathlib.Path(f'prismer/logging/{model_name}').exists():
            continue
        subprocess.run(shlex.split(f'python download_checkpoints.py --download_models={model_name}'), cwd='prismer')


def build_deformable_conv() -> None:
    subprocess.run(shlex.split('sh make.sh'), cwd='prismer/experts/segmentation/mask2former/modeling/pixel_decoder/ops')


def run_expert(expert_name: str):
    env = os.environ.copy()
    if 'PYTHONPATH' in env:
        env['PYTHONPATH'] = f'{submodule_dir.as_posix()}:{env["PYTHONPATH"]}'
    else:
        env['PYTHONPATH'] = submodule_dir.as_posix()
    subprocess.run(shlex.split(f'python experts/generate_{expert_name}.py'),
                   cwd='prismer',
                   env=env,
                   check=True)


def compute_md5(image_path: str) -> str:
    with open(image_path, 'rb') as f:
        s = f.read()
    return hashlib.md5(s).hexdigest()


def run_experts(image_path: str) -> Tuple[str, Tuple[str, ...]]:
    im_name = compute_md5(image_path)
    image_dir = submodule_dir / 'helpers' / 'images'
    out_path = image_dir / f'{im_name}.jpg'
    image_dir.mkdir(parents=True, exist_ok=True)
    keys = ['depth', 'edge', 'normal', 'seg_coco', 'obj_detection', 'ocr_detection']
    results = [pathlib.Path('prismer/helpers/labels') / key / f'helpers/images/{im_name}.png' for key in keys]
    results_pretty = [pathlib.Path('prismer/helpers/labels') / key / f'helpers/images/{im_name}_p.png' for key in keys]
    out_paths = tuple(path.as_posix() for path in results)
    pretty_paths = tuple(path.as_posix() for path in results_pretty)

    config = yaml.load(open('prismer/configs/experts.yaml', 'r'), Loader=yaml.Loader)
    config['im_name'] = im_name
    with open('prismer/configs/experts.yaml', 'w') as yaml_file:
        yaml.dump(config, yaml_file, default_flow_style=False)

    if not os.path.exists(out_paths[0]):
        cv2.imwrite(out_path.as_posix(), cv2.imread(image_path))

        # paralleled inference
        expert_names = ['edge', 'normal', 'objdet', 'ocrdet', 'segmentation']
        run_expert('depth')
        with concurrent.futures.ProcessPoolExecutor() as executor:
            executor.map(run_expert, expert_names)
            executor.shutdown(wait=True)

        # no parallelization just to be safe
        # expert_names = ['depth', 'edge', 'normal', 'objdet', 'ocrdet', 'segmentation']
        # for exp in expert_names:
        #     run_expert(exp)

    label_prettify(image_path, out_paths)
    return im_name, pretty_paths


class Model:
    def __init__(self):
        self.config = None
        self.model = None
        self.tokenizer = None
        self.model_name = ''
        self.exp_name = ''
        self.mode = ''

    def set_model(self, exp_name: str, mode: str) -> None:
        if exp_name == self.exp_name and mode == self.mode:
            return

        # load checkpoints
        model_name = exp_name.lower().replace('-', '_')
        if mode == 'caption':
            config = {
                'dataset': 'demo',
                'data_path': 'prismer/helpers',
                'label_path': 'prismer/helpers/labels',
                'experts': ['depth', 'normal', 'seg_coco', 'edge', 'obj_detection', 'ocr_detection'],
                'image_resolution': 480,
                'prismer_model': model_name,
                'freeze': 'freeze_vision',
                'prefix': '',
            }
            model = PrismerCaption(config)
            state_dict = torch.load(f'prismer/logging/pretrain_{model_name}/pytorch_model.bin', map_location='cuda:0')
            state_dict['expert_encoder.positional_embedding'] = interpolate_pos_embed(state_dict['expert_encoder.positional_embedding'],
                                                                                      len(model.expert_encoder.positional_embedding))
        elif mode == 'vqa':
            config = {
                'dataset': 'demo',
                'data_path': 'prismer/helpers',
                'label_path': 'prismer/helpers/labels',
                'experts': ['depth', 'normal', 'seg_coco', 'edge', 'obj_detection', 'ocr_detection'],
                'image_resolution': 480,
                'prismer_model': model_name,
                'freeze': 'freeze_vision',
                'prefix': '',
            }

            model = PrismerVQA(config)
            state_dict = torch.load(f'prismer/logging/vqa_{model_name}/pytorch_model.bin', map_location='cuda:0')
            state_dict['expert_encoder.positional_embedding'] = interpolate_pos_embed(state_dict['expert_encoder.positional_embedding'],
                                                                                      len(model.expert_encoder.positional_embedding))

        model.load_state_dict(state_dict)
        model = model.half()
        model.eval()

        self.config = config
        self.model = model.to('cuda:0')
        self.tokenizer = model.tokenizer
        self.exp_name = exp_name
        self.mode = mode

    @torch.inference_mode()
    def run_caption_model(self, exp_name: str, im_name: str) -> str:
        self.set_model(exp_name, 'caption')
        self.config['im_name'] = im_name
        _, test_dataset = create_dataset('caption', self.config)
        test_loader = create_loader(test_dataset, batch_size=1, num_workers=4, train=False)
        experts, _ = next(iter(test_loader))
        for exp in experts:
            if exp == 'obj_detection':
                experts[exp]['label'] = experts['obj_detection']['label'].to('cuda:0')
                experts[exp]['instance'] = experts['obj_detection']['instance'].to('cuda:0')
            else:
                experts[exp] = experts[exp].to('cuda:0')
        captions = self.model(experts, train=False, prefix=self.config['prefix'])
        captions = self.tokenizer(captions, max_length=30, padding='max_length', return_tensors='pt').input_ids
        caption = captions.to(experts['rgb'].device)[0]
        caption = self.tokenizer.decode(caption, skip_special_tokens=True)
        caption = caption.capitalize() + '.'
        return caption

    def run_caption(self, image_path: str, model_name: str) -> tuple[str | None, ...]:
        im_name, pretty_paths = run_experts(image_path)
        caption = self.run_caption_model(model_name, im_name)
        return caption, *pretty_paths

    @torch.inference_mode()
    def run_vqa_model(self, exp_name: str, im_name: str, question: str) -> str:
        self.set_model(exp_name, 'vqa')
        self.config['im_name'] = im_name
        _, test_dataset = create_dataset('caption', self.config)
        test_loader = create_loader(test_dataset, batch_size=1, num_workers=4, train=False)
        experts, _ = next(iter(test_loader))
        for exp in experts:
            if exp == 'obj_detection':
                experts[exp]['label'] = experts['obj_detection']['label'].to('cuda:0')
                experts[exp]['instance'] = experts['obj_detection']['instance'].to('cuda:0')
            else:
                experts[exp] = experts[exp].to('cuda:0')
        question = pre_question(question)
        answer = self.model(experts, [question], train=False, inference='generate')
        answer = self.tokenizer(answer, max_length=30, padding='max_length', return_tensors='pt').input_ids
        answer = answer.to(experts['rgb'].device)[0]
        answer = self.tokenizer.decode(answer, skip_special_tokens=True)
        answer = answer.capitalize() + '.'
        return answer

    def run_vqa(self, image_path: str, model_name: str, question: str) -> tuple[str | None, ...]:
        im_name, pretty_paths = run_experts(image_path)
        answer = self.run_vqa_model(model_name, im_name, question)
        return answer, *pretty_paths