from captioner import build_captioner, BaseCaptioner
from segmenter import build_segmenter
from text_refiner import build_text_refiner
import os
import argparse
import time
from PIL import Image
import cv2
import numpy as np

class CaptionAnything():
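    """Segment-then-caption pipeline combining a segmenter (SAM), a
    captioner (e.g. BLIP2), and an optional GPT-based text refiner.

    Minimal usage sketch (assumes the default checkpoints are available;
    with no OpenAI key the refiner is simply skipped):

        args = parse_augment()
        model = CaptionAnything(args, api_key=os.environ.get('OPENAI_API_KEY', ''))
        out = model.inference('test_img/img13.jpg', prompt, controls)
        print(out['generated_captions'])
    """
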
    def __init__(self, args, api_key="", captioner=None, segmenter=None, text_refiner=None):
        self.args = args
        self.captioner = build_captioner(args.captioner, args.device, args) if captioner is None else captioner
        self.segmenter = build_segmenter(args.segmenter, args.device, args) if segmenter is None else segmenter
        
        self.text_refiner = None
        if not args.disable_gpt:
            if text_refiner is not None:
                self.text_refiner = text_refiner
            else:
                self.init_refiner(api_key)

    def init_refiner(self, api_key):
        try:
            self.text_refiner = build_text_refiner(self.args.text_refiner, self.args.device, self.args, api_key)
            self.text_refiner.llm('hi')  # probe the LLM once so an invalid key fails fast
        except Exception as e:
            self.text_refiner = None
            print(f'OpenAI GPT is not available: {e}')
    
    def inference(self, image, prompt, controls, disable_gpt=False, enable_wiki=False):
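        """Segment the region selected by `prompt`, caption the cropped
        region, and optionally refine the caption with the text refiner.

        Returns a dict holding the (refined) captions, the crop and mask
        save paths, the mask image, and any context captions.
        """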
        #  segment with prompt
        print("CA prompt: ", prompt, "CA controls",controls)
        seg_mask = self.segmenter.inference(image, prompt)[0, ...]
        if self.args.enable_morphologyex:
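            # Morphological opening removes small speckles from the mask and
            # closing fills small holes, yielding a cleaner region to crop.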
            seg_mask = 255 * seg_mask.astype(np.uint8)
            seg_mask = np.stack([seg_mask, seg_mask, seg_mask], axis = -1)
            seg_mask = cv2.morphologyEx(seg_mask, cv2.MORPH_OPEN, kernel = np.ones((6, 6), np.uint8))
            seg_mask = cv2.morphologyEx(seg_mask, cv2.MORPH_CLOSE, kernel = np.ones((6, 6), np.uint8))
            seg_mask = seg_mask[:,:,0] > 0
        mask_save_path = f'result/mask_{time.time()}.png'
        os.makedirs(os.path.dirname(mask_save_path), exist_ok=True)
        seg_mask_img = Image.fromarray(seg_mask.astype(np.uint8) * 255)
        if seg_mask_img.mode != 'RGB':
            seg_mask_img = seg_mask_img.convert('RGB')
        seg_mask_img.save(mask_save_path)
        print('seg_mask path: ', mask_save_path)
        print("seg_mask.shape: ", seg_mask.shape)
        #  captioning with mask
        if self.args.enable_reduce_tokens:
            caption, crop_save_path = self.captioner.inference_with_reduced_tokens(image, seg_mask, crop_mode=self.args.seg_crop_mode, filter=self.args.clip_filter, disable_regular_box=self.args.disable_regular_box)
        else:
            caption, crop_save_path = self.captioner.inference_seg(image, seg_mask, crop_mode=self.args.seg_crop_mode, filter=self.args.clip_filter, disable_regular_box=self.args.disable_regular_box)
        #  refining with TextRefiner
        context_captions = []
        if self.args.context_captions:
            context_captions.append(self.captioner.inference(image))
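        # Refine with the LLM when GPT is enabled and available; otherwise
        # fall back to the raw caption.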
        if not disable_gpt and self.text_refiner is not None:
            refined_caption = self.text_refiner.inference(query=caption, controls=controls, context=context_captions, enable_wiki=enable_wiki)
        else:
            refined_caption = {'raw_caption': caption}
        out = {
            'generated_captions': refined_caption,
            'crop_save_path': crop_save_path,
            'mask_save_path': mask_save_path,
            'mask': seg_mask_img,
            'context_captions': context_captions,
        }
        return out
    
def parse_augment():
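    """Parse command-line arguments for the captioning pipeline."""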
    parser = argparse.ArgumentParser()
    parser.add_argument('--captioner', type=str, default="blip2")
    parser.add_argument('--segmenter', type=str, default="huge")
    parser.add_argument('--text_refiner', type=str, default="base")
    parser.add_argument('--segmenter_checkpoint', type=str, default="segmenter/sam_vit_h_4b8939.pth")
    parser.add_argument('--seg_crop_mode', type=str, default="wo_bg", choices=['wo_bg', 'w_bg'], help="whether to add or remove background of the image when captioning")
    parser.add_argument('--clip_filter', action="store_true", help="use clip to filter bad captions")
    parser.add_argument('--context_captions', action="store_true", help="use surrounding captions to enhance current caption (TODO)")
    parser.add_argument('--disable_regular_box', action="store_true", default=False, help="do not crop the masked region with a regular (axis-aligned) box")
    parser.add_argument('--device', type=str, default="cuda:0")
    parser.add_argument('--port', type=int, default=6086, help="only useful when running gradio applications")  
    parser.add_argument('--debug', action="store_true")
    parser.add_argument('--gradio_share', action="store_true")
    parser.add_argument('--disable_gpt', action="store_true")
    parser.add_argument('--enable_reduce_tokens', action="store_true", default=False)
    parser.add_argument('--disable_reuse_features', action="store_true", default=False)
    parser.add_argument('--enable_morphologyex', action="store_true", default=False)
    args = parser.parse_args()

    if args.debug:
        print(args)
    return args

if __name__ == "__main__":
    args = parse_augment()
    # image_path = 'test_img/img3.jpg'
    image_path = 'test_img/img13.jpg'
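    # Click prompts in the SAM format: `input_point` holds pixel coordinates
    # and `input_label` marks each click as foreground (1) or background (0).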
    prompts = [
        {
            "prompt_type":["click"],
            "input_point":[[500, 300], [1000, 500]],
            "input_label":[1, 0],
            "multimask_output":"True",
        },
        {
            "prompt_type":["click"],
            "input_point":[[900, 800]],
            "input_label":[1],
            "multimask_output":"True",
        }
    ]
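    # Language controls forwarded to the text refiner: desired caption
    # length, sentiment, whether imagined details are allowed, and language.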
    controls = {
        "length": "30",
        "sentiment": "positive",
        # "imagination": "True",
        "imagination": "False",
        "language": "English",
    }
    
    model = CaptionAnything(args, os.environ.get('OPENAI_API_KEY', ''))
    for prompt in prompts:
        print('*'*30)
        print('Image path: ', image_path)
        image = Image.open(image_path)
        print('Image size:', image.size)
        print('Visual controls (SAM prompt):\n', prompt)
        print('Language controls:\n', controls)
        out = model.inference(image_path, prompt, controls)
        print('Generated captions:\n', out['generated_captions'])