fffiloni committed on
Commit
2338cb6
1 Parent(s): 20cfcc2

Delete tasks

tasks/__init__.py DELETED
@@ -1,11 +0,0 @@
- from .img_cap import image_captioning
- from .open_inst import open_instseg
- from .open_pano import open_panoseg
- from .open_sem import open_semseg
- from .ref_cap import referring_captioning
- from .ref_in import referring_inpainting
- from .ref_seg import referring_segmentation
- from .text_ret import text_retrieval
- from .reg_ret import region_retrieval
- from .ref_in_gpt3 import referring_inpainting_gpt3
- from . import img_cap, open_inst, open_pano, open_sem, ref_cap, ref_in, ref_seg, text_ret
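
For reference, every task deleted here shared the same (model, image, texts, inpainting_text) signature and returned an (output image, output text, extra image) triple; a minimal dispatch sketch, assuming a model wrapper is already loaded (loading is omitted and the run_task helper is hypothetical, not part of the original repo):

from PIL import Image
import tasks  # the package this commit removes

def run_task(task_fn, model, image_path, texts="", inpainting_text=""):
    # Every task returns (output image, output text, optional extra image).
    image = Image.open(image_path).convert("RGB")
    return task_fn(model, image, texts, inpainting_text)

# e.g. out_img, caption, _ = run_task(tasks.image_captioning, model, "demo.jpg")
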
tasks/__pycache__/__init__.cpython-38.pyc DELETED
Binary file (713 Bytes)
 
tasks/__pycache__/img_cap.cpython-38.pyc DELETED
Binary file (1.34 kB)
 
tasks/__pycache__/open_inst.cpython-38.pyc DELETED
Binary file (2.25 kB)
 
tasks/__pycache__/open_pano.cpython-38.pyc DELETED
Binary file (2.88 kB)
 
tasks/__pycache__/open_sem.cpython-38.pyc DELETED
Binary file (2.17 kB)
 
tasks/__pycache__/readme.txt DELETED
File without changes
tasks/__pycache__/ref_cap.cpython-38.pyc DELETED
Binary file (2.15 kB)
 
tasks/__pycache__/ref_in.cpython-38.pyc DELETED
Binary file (2.57 kB)
 
tasks/__pycache__/ref_in_gpt3.cpython-38.pyc DELETED
Binary file (3.79 kB)
 
tasks/__pycache__/ref_seg.cpython-38.pyc DELETED
Binary file (1.72 kB)
 
tasks/__pycache__/reg_ret.cpython-38.pyc DELETED
Binary file (2.7 kB)
 
tasks/__pycache__/text_ret.cpython-38.pyc DELETED
Binary file (1.88 kB)
 
tasks/img_cap.py DELETED
@@ -1,54 +0,0 @@
- # --------------------------------------------------------
- # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
- # Copyright (c) 2022 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Xueyan Zou (xueyan@cs.wisc.edu)
- # --------------------------------------------------------
-
- import cv2
- import torch
- import numpy as np
- from PIL import Image
- from torchvision import transforms
-
-
- t = []
- t.append(transforms.Resize(224, interpolation=Image.BICUBIC))
- transform = transforms.Compose(t)
-
- t = []
- t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
- transform_v = transforms.Compose(t)
-
- def image_captioning(model, image, texts, inpainting_text, *args, **kwargs):
-     with torch.no_grad():
-         image_ori = transform_v(image)
-         width = image_ori.size[0]
-         height = image_ori.size[1]
-         image_ori = np.asarray(image_ori)
-
-         image = transform(image)
-         image = np.asarray(image)
-         images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-
-         batch_inputs = [{'image': images, 'height': height, 'width': width, 'image_id': 0}]
-         outputs = model.model.evaluate_captioning(batch_inputs)
-         text = outputs[-1]['captioning_text']
-
-         image_ori = image_ori.copy()
-         cv2.rectangle(image_ori, (0, height-60), (width, height), (0,0,0), -1)
-         font = cv2.FONT_HERSHEY_DUPLEX
-         fontScale = 1.2
-         thickness = 2
-         lineType = 2
-         bottomLeftCornerOfText = (10, height-20)
-         fontColor = [255,255,255]
-         cv2.putText(image_ori, text,
-                     bottomLeftCornerOfText,
-                     font,
-                     fontScale,
-                     fontColor,
-                     thickness,
-                     lineType)
-     torch.cuda.empty_cache()
-     return Image.fromarray(image_ori), text, None
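
The caption overlay in image_captioning is plain OpenCV drawing; the same banner technique on a stand-in array (the array size and caption string below are illustrative, not from the repo):

import cv2
import numpy as np

img = np.zeros((512, 512, 3), dtype=np.uint8)  # stand-in for the resized frame
text = "a cat sitting on a couch"
h, w = img.shape[:2]

# Black banner along the bottom edge, then white text inside it.
cv2.rectangle(img, (0, h - 60), (w, h), (0, 0, 0), -1)
cv2.putText(img, text, (10, h - 20), cv2.FONT_HERSHEY_DUPLEX, 1.2,
            (255, 255, 255), 2, 2)
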
tasks/open_inst.py DELETED
@@ -1,60 +0,0 @@
- # --------------------------------------------------------
- # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
- # Copyright (c) 2022 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Xueyan Zou (xueyan@cs.wisc.edu)
- # --------------------------------------------------------
-
- import torch
- import numpy as np
- from PIL import Image
- from torchvision import transforms
- from utils.visualizer import Visualizer
- from detectron2.utils.colormap import random_color
- from detectron2.data import MetadataCatalog
- from detectron2.structures import BitMasks
-
-
- t = []
- t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
- transform = transforms.Compose(t)
- metadata = MetadataCatalog.get('ade20k_panoptic_train')
-
- def open_instseg(model, image, texts, inpainting_text, *args, **kwargs):
-     thing_classes = [x.strip() for x in texts.split(',')]
-     thing_colors = [random_color(rgb=True, maximum=255).astype(np.int32).tolist() for _ in range(len(thing_classes))]
-     thing_dataset_id_to_contiguous_id = {x:x for x in range(len(thing_classes))}
-
-     MetadataCatalog.get("demo").set(
-         thing_colors=thing_colors,
-         thing_classes=thing_classes,
-         thing_dataset_id_to_contiguous_id=thing_dataset_id_to_contiguous_id,
-     )
-
-     with torch.no_grad():
-         model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(thing_classes + ["background"], is_eval=True)
-
-         metadata = MetadataCatalog.get('demo')
-         model.model.metadata = metadata
-         model.model.sem_seg_head.num_classes = len(thing_classes)
-
-         image_ori = transform(image)
-         width = image_ori.size[0]
-         height = image_ori.size[1]
-         image = np.asarray(image_ori)
-         images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-
-         batch_inputs = [{'image': images, 'height': height, 'width': width}]
-         outputs = model.forward(batch_inputs)
-         visual = Visualizer(image_ori, metadata=metadata)
-
-         inst_seg = outputs[-1]['instances']
-         inst_seg.pred_masks = inst_seg.pred_masks.cpu()
-         inst_seg.pred_boxes = BitMasks(inst_seg.pred_masks > 0).get_bounding_boxes()
-         demo = visual.draw_instance_predictions(inst_seg) # rgb Image
-         res = demo.get_image()
-
-
-     MetadataCatalog.remove('demo')
-     torch.cuda.empty_cache()
-     return Image.fromarray(res), '', None
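
open_instseg handles arbitrary user-supplied class names by registering a throwaway "demo" metadata entry and removing it afterwards; that register-then-remove pattern in isolation (the class names below are illustrative):

import numpy as np
from detectron2.data import MetadataCatalog
from detectron2.utils.colormap import random_color

thing_classes = ["cat", "dog"]  # would come from the comma-separated user input
MetadataCatalog.get("demo").set(
    thing_classes=thing_classes,
    thing_colors=[random_color(rgb=True, maximum=255).astype(np.int32).tolist()
                  for _ in thing_classes],
    thing_dataset_id_to_contiguous_id={i: i for i in range(len(thing_classes))},
)
# ... inference and visualization would run against this metadata ...
MetadataCatalog.remove("demo")  # so the next request starts from a clean slate
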
tasks/open_pano.py DELETED
@@ -1,70 +0,0 @@
- # --------------------------------------------------------
- # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
- # Copyright (c) 2022 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Xueyan Zou (xueyan@cs.wisc.edu)
- # --------------------------------------------------------
-
- import torch
- import numpy as np
- from PIL import Image
- from torchvision import transforms
- from utils.visualizer import Visualizer
- from detectron2.utils.colormap import random_color
- from detectron2.data import MetadataCatalog
-
-
- t = []
- t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
- transform = transforms.Compose(t)
- metadata = MetadataCatalog.get('ade20k_panoptic_train')
-
- def open_panoseg(model, image, texts, inpainting_text, *args, **kwargs):
-     stuff_classes = [x.strip() for x in texts.split(';')[0].replace('stuff:','').split(',')]
-     thing_classes = [x.strip() for x in texts.split(';')[1].replace('thing:','').split(',')]
-     thing_colors = [random_color(rgb=True, maximum=255).astype(np.int32).tolist() for _ in range(len(thing_classes))]
-     stuff_colors = [random_color(rgb=True, maximum=255).astype(np.int32).tolist() for _ in range(len(stuff_classes))]
-     thing_dataset_id_to_contiguous_id = {x:x for x in range(len(thing_classes))}
-     stuff_dataset_id_to_contiguous_id = {x+len(thing_classes):x for x in range(len(stuff_classes))}
-
-     MetadataCatalog.get("demo").set(
-         thing_colors=thing_colors,
-         thing_classes=thing_classes,
-         thing_dataset_id_to_contiguous_id=thing_dataset_id_to_contiguous_id,
-         stuff_colors=stuff_colors,
-         stuff_classes=stuff_classes,
-         stuff_dataset_id_to_contiguous_id=stuff_dataset_id_to_contiguous_id,
-     )
-     model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(thing_classes + stuff_classes + ["background"], is_eval=True)
-     metadata = MetadataCatalog.get('demo')
-     model.model.metadata = metadata
-     model.model.sem_seg_head.num_classes = len(thing_classes + stuff_classes)
-
-     with torch.no_grad():
-         image_ori = transform(image)
-         width = image_ori.size[0]
-         height = image_ori.size[1]
-         image = transform(image_ori)
-         image = np.asarray(image)
-         images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-
-         batch_inputs = [{'image': images, 'height': height, 'width': width}]
-         outputs = model.forward(batch_inputs)
-         visual = Visualizer(image_ori, metadata=metadata)
-
-         pano_seg = outputs[-1]['panoptic_seg'][0]
-         pano_seg_info = outputs[-1]['panoptic_seg'][1]
-
-         for i in range(len(pano_seg_info)):
-             if pano_seg_info[i]['category_id'] in metadata.thing_dataset_id_to_contiguous_id.keys():
-                 pano_seg_info[i]['category_id'] = metadata.thing_dataset_id_to_contiguous_id[pano_seg_info[i]['category_id']]
-             else:
-                 pano_seg_info[i]['isthing'] = False
-                 pano_seg_info[i]['category_id'] = metadata.stuff_dataset_id_to_contiguous_id[pano_seg_info[i]['category_id']]
-
-         demo = visual.draw_panoptic_seg(pano_seg.cpu(), pano_seg_info) # rgb Image
-         res = demo.get_image()
-
-     MetadataCatalog.remove('demo')
-     torch.cuda.empty_cache()
-     return Image.fromarray(res), '', None
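
open_panoseg expects its text input in a "stuff: a, b; thing: c, d" format; a small sketch of that parsing step in isolation, mirroring the split-and-replace used above:

def parse_pano_classes(texts: str):
    # e.g. "stuff: wall, sky; thing: person, car"
    stuff_part, thing_part = texts.split(';')
    stuff = [x.strip() for x in stuff_part.replace('stuff:', '').split(',')]
    things = [x.strip() for x in thing_part.replace('thing:', '').split(',')]
    return stuff, things

print(parse_pano_classes("stuff: wall, sky; thing: person, car"))
# (['wall', 'sky'], ['person', 'car'])
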
tasks/open_sem.py DELETED
@@ -1,57 +0,0 @@
- # --------------------------------------------------------
- # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
- # Copyright (c) 2022 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Xueyan Zou (xueyan@cs.wisc.edu)
- # --------------------------------------------------------
-
- import os
- import cv2
- import torch
- import numpy as np
- from PIL import Image
- from torchvision import transforms
- from utils.visualizer import Visualizer
- from detectron2.utils.colormap import random_color
- from detectron2.data import MetadataCatalog
-
-
- t = []
- t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
- transform = transforms.Compose(t)
- metadata = MetadataCatalog.get('ade20k_panoptic_train')
-
- def open_semseg(model, image, texts, inpainting_text, *args, **kwargs):
-     stuff_classes = [x.strip() for x in texts.split(',')]
-     stuff_colors = [random_color(rgb=True, maximum=255).astype(np.int32).tolist() for _ in range(len(stuff_classes))]
-     stuff_dataset_id_to_contiguous_id = {x:x for x in range(len(stuff_classes))}
-
-     MetadataCatalog.get("demo").set(
-         stuff_colors=stuff_colors,
-         stuff_classes=stuff_classes,
-         stuff_dataset_id_to_contiguous_id=stuff_dataset_id_to_contiguous_id,
-     )
-     model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(stuff_classes + ["background"], is_eval=True)
-     metadata = MetadataCatalog.get('demo')
-     model.model.metadata = metadata
-     model.model.sem_seg_head.num_classes = len(stuff_classes)
-
-     with torch.no_grad():
-         image_ori = transform(image)
-         width = image_ori.size[0]
-         height = image_ori.size[1]
-         image = transform(image_ori)
-         image = np.asarray(image)
-         images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-
-         batch_inputs = [{'image': images, 'height': height, 'width': width}]
-         outputs = model.forward(batch_inputs)
-         visual = Visualizer(image_ori, metadata=metadata)
-
-         sem_seg = outputs[-1]['sem_seg'].max(0)[1]
-         demo = visual.draw_sem_seg(sem_seg.cpu(), alpha=0.5) # rgb Image
-         res = demo.get_image()
-
-     MetadataCatalog.remove('demo')
-     torch.cuda.empty_cache()
-     return Image.fromarray(res), '', None
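
The sem_seg output above holds one score map per class, and max(0)[1] collapses them to a per-pixel label map; the same step on a dummy tensor:

import torch

sem_seg = torch.rand(3, 4, 4)   # (num_classes, H, W) score maps
labels = sem_seg.max(0)[1]      # (H, W) index of the best class per pixel
assert labels.shape == (4, 4) and labels.max() < 3
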
tasks/ref_cap.py DELETED
@@ -1,68 +0,0 @@
- # --------------------------------------------------------
- # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
- # Copyright (c) 2022 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Xueyan Zou (xueyan@cs.wisc.edu)
- # --------------------------------------------------------
-
- import torch
- import torch.nn.functional as F
- import numpy as np
- from PIL import Image
- from torchvision import transforms
- from utils.visualizer import Visualizer
- from detectron2.data import MetadataCatalog
-
- t = []
- t.append(transforms.Resize(224, interpolation=Image.BICUBIC))
- transform_ret = transforms.Compose(t)
- t = []
- t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
- transform_grd = transforms.Compose(t)
-
- metedata = MetadataCatalog.get('coco_2017_train_panoptic')
-
- def referring_captioning(model, image, texts, inpainting_text, *args, **kwargs):
-     model_last, model_cap = model
-     with torch.no_grad():
-         image_ori = image
-         image = transform_grd(image)
-         width = image.size[0]
-         height = image.size[1]
-         image = np.asarray(image)
-         image_ori_ = image
-         images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-         texts_input = [[texts.strip() if texts.endswith('.') else (texts + '.')]]
-
-         batch_inputs = [{'image': images, 'groundings': {'texts':texts_input}, 'height': height, 'width': width}]
-         outputs = model_last.model.evaluate_grounding(batch_inputs, None)
-
-         grd_mask = (outputs[-1]['grounding_mask'] > 0).float()
-         grd_mask_ = (1 - F.interpolate(grd_mask[None,], (224, 224), mode='nearest')[0]).bool()
-
-         color = [252/255, 91/255, 129/255]
-         visual = Visualizer(image_ori_, metadata=metedata)
-         demo = visual.draw_binary_mask(grd_mask.cpu().numpy()[0], color=color, text=texts)
-         res = demo.get_image()
-
-         if (1 - grd_mask_.float()).sum() < 5:
-             torch.cuda.empty_cache()
-             return Image.fromarray(res), 'n/a', None
-
-         grd_mask_ = grd_mask_ * 0
-         image = transform_ret(image_ori)
-         image_ori = np.asarray(image_ori)
-         image = np.asarray(image)
-         images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-         batch_inputs = [{'image': images, 'image_id': 0, 'captioning_mask': grd_mask_}]
-
-         token_text = texts.replace('.','') if texts.endswith('.') else texts
-         token = model_cap.model.sem_seg_head.predictor.lang_encoder.tokenizer.encode(token_text)
-         token = torch.tensor(token)[None,:-1]
-
-         outputs = model_cap.model.evaluate_captioning(batch_inputs, extra={'token': token})
-         # outputs = model_cap.model.evaluate_captioning(batch_inputs, extra={})
-         text = outputs[-1]['captioning_text']
-
-     torch.cuda.empty_cache()
-     return Image.fromarray(res), text, None
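
The grd_mask_ step above inverts the grounding mask and resizes it to the 224-px captioning input so the captioner can be masked to the referred region; the same transform on a dummy mask (shapes are illustrative):

import torch
import torch.nn.functional as F

grd_mask = (torch.rand(1, 512, 512) > 0.5).float()   # stand-in grounding mask
keep = F.interpolate(grd_mask[None], (224, 224), mode='nearest')[0]
captioning_mask = (1 - keep).bool()                  # True where to ignore
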
tasks/ref_in.py DELETED
@@ -1,77 +0,0 @@
- # --------------------------------------------------------
- # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
- # Copyright (c) 2022 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Jianwei Yang (jianwyan@microsoft.com), Xueyan Zou (xueyan@cs.wisc.edu)
- # --------------------------------------------------------
-
- import torch
- import numpy as np
- from PIL import Image
- from utils.inpainting import pad_image
- from torchvision import transforms
- from utils.visualizer import Visualizer
- from diffusers import StableDiffusionInpaintPipeline
- from detectron2.utils.colormap import random_color
- from detectron2.data import MetadataCatalog
- from scipy import ndimage
-
-
- t = []
- t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
- transform = transforms.Compose(t)
- metadata = MetadataCatalog.get('ade20k_panoptic_train')
-
- pipe = StableDiffusionInpaintPipeline.from_pretrained(
-     # "stabilityai/stable-diffusion-2-inpainting",
-     "runwayml/stable-diffusion-inpainting",
-     revision="fp16",
-     torch_dtype=torch.float16,
- ).to("cuda")
-
- def crop_image(input_image):
-     crop_w, crop_h = np.floor(np.array(input_image.size) / 64).astype(int) * 64
-     im_cropped = Image.fromarray(np.array(input_image)[:crop_h, :crop_w])
-     return im_cropped
-
- def referring_inpainting(model, image, texts, inpainting_text, *args, **kwargs):
-     model.model.metadata = metadata
-     texts = [[texts if texts.strip().endswith('.') else (texts.strip() + '.')]]
-     image_ori = crop_image(transform(image))
-
-     with torch.no_grad():
-         width = image_ori.size[0]
-         height = image_ori.size[1]
-         image = np.asarray(image_ori)
-         image_ori_np = np.asarray(image_ori)
-         images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-
-         batch_inputs = [{'image': images, 'height': height, 'width': width, 'groundings': {'texts': texts}}]
-         outputs = model.model.evaluate_grounding(batch_inputs, None)
-         visual = Visualizer(image_ori_np, metadata=metadata)
-
-         grd_mask = (outputs[0]['grounding_mask'] > 0).float().cpu().numpy()
-         for idx, mask in enumerate(grd_mask):
-             color = random_color(rgb=True, maximum=1).astype(np.int32).tolist()
-             demo = visual.draw_binary_mask(mask, color=color, text=texts[idx])
-             res = demo.get_image()
-
-         if inpainting_text not in ['no', '']:
-             # if we want to do inpainting
-             image_crop = image_ori
-             struct2 = ndimage.generate_binary_structure(2, 2)
-             mask_dilated = ndimage.binary_dilation(grd_mask[0], structure=struct2, iterations=3).astype(grd_mask[0].dtype)
-             mask = Image.fromarray(mask_dilated * 255).convert('RGB')
-             image_and_mask = {
-                 "image": image_crop,
-                 "mask": mask,
-             }
-             width = image_crop.size[0]; height = image_crop.size[1]
-             images_inpainting = pipe(prompt = inpainting_text.strip(), image=image_and_mask['image'], mask_image=image_and_mask['mask'], height=height, width=width).images[0]
-             # put images_inpainting back to original image
-             # image_ori.paste(images_inpainting)
-             torch.cuda.empty_cache()
-             return Image.fromarray(res) ,'' , images_inpainting
-         else:
-             torch.cuda.empty_cache()
-             return image_ori, 'text', Image.fromarray(res)
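
crop_image exists because the Stable Diffusion inpainting pipeline expects dimensions divisible by 64; the same helper as a standalone function (the name crop_to_multiple_of_64 is mine):

import numpy as np
from PIL import Image

def crop_to_multiple_of_64(img: Image.Image) -> Image.Image:
    # Floor each side down to the nearest multiple of 64; PIL size is (w, h)
    # while the numpy array is indexed (h, w).
    crop_w, crop_h = (np.array(img.size) // 64) * 64
    return Image.fromarray(np.array(img)[:crop_h, :crop_w])
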
tasks/ref_in_gpt3.py DELETED
@@ -1,109 +0,0 @@
- # --------------------------------------------------------
- # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
- # Copyright (c) 2022 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Jianwei Yang (jianwyan@microsoft.com)
- # --------------------------------------------------------
- import os
- import openai
- import torch
- import numpy as np
- from scipy import ndimage
- from PIL import Image
- from utils.inpainting import pad_image, crop_image
- from torchvision import transforms
- from utils.visualizer import Visualizer
- from diffusers import StableDiffusionInpaintPipeline
- from detectron2.utils.colormap import random_color
- from detectron2.data import MetadataCatalog
-
-
- t = []
- t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
- transform = transforms.Compose(t)
- metadata = MetadataCatalog.get('ade20k_panoptic_train')
-
- pipe = StableDiffusionInpaintPipeline.from_pretrained(
-     # "stabilityai/stable-diffusion-2-inpainting",
-     "runwayml/stable-diffusion-inpainting",
-     revision="fp16",
-     torch_dtype=torch.float16,
- ).to("cuda")
-
- prompts = []
- prompts.append("instruction: remove the person, task: (referring editing), source: [person], target:<clean and empty scene>.")
- prompts.append("instruction: remove the person in the middle, task: (referring editing), source: [person in the middle], target:<clean and empty scene>.")
- prompts.append("instruction: remove the dog on the left side, task: (referring editing), source: [dog on the left side], target:<clean and empty scene>.")
- prompts.append("instruction: change the apple to a pear, task: (referring editing), source: [apple], target: <pear>.")
- prompts.append("instruction: change the red apple to a green one, task: (referring editing), source: [red apple], target: <green apple>.")
- prompts.append("instruction: change the color of bird's feathers from white to blue, task: (referring editing), source: [white bird], target: <blue bird>.")
- prompts.append("instruction: replace the dog with a cat, task: (referring editing), source: [dot], target: <cat>.")
- prompts.append("instruction: replace the red apple with a green one, task: (referring editing), source: [red apple], target: <green apple>.")
-
- #openai.api_type = "azure"
- #openai.api_base = "https://xdecoder.openai.azure.com/"
- #openai.api_version = "2022-12-01"
- openai.organization = os.environ["OPENAI_ORG"]
- openai.api_key = os.environ["OPENAI_API_KEY"]
-
- def get_gpt3_response(prompt):
-     response = openai.Completion.create(
-         model="text-davinci-003",
-         prompt=prompt,
-         temperature=0.7,
-         max_tokens=512,
-         top_p=1,
-         frequency_penalty=0,
-         presence_penalty=0,
-     )
-
-     return response
-
- def referring_inpainting_gpt3(model, image, instruction, *args, **kwargs):
-     # convert instruction to source and target
-     instruction = instruction.replace('.', '')
-     print(instruction)
-     resp = get_gpt3_response(' '.join(prompts) + ' instruction: ' + instruction + ',')
-     resp_text = resp['choices'][0]['text']
-     print(resp_text)
-     ref_text = resp_text[resp_text.find('[')+1:resp_text.find(']')]
-     inp_text = resp_text[resp_text.find('<')+1:resp_text.find('>')]
-
-     model.model.metadata = metadata
-     texts = [[ref_text if ref_text.strip().endswith('.') else (ref_text.strip() + '.')]]
-     image_ori = crop_image(transform(image))
-
-     with torch.no_grad():
-         width = image_ori.size[0]
-         height = image_ori.size[1]
-         image = np.asarray(image_ori)
-         image_ori_np = np.asarray(image_ori)
-         images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-
-         batch_inputs = [{'image': images, 'height': height, 'width': width, 'groundings': {'texts': texts}}]
-         outputs = model.model.evaluate_grounding(batch_inputs, None)
-         visual = Visualizer(image_ori_np, metadata=metadata)
-
-         grd_mask = (outputs[0]['grounding_mask'] > 0).float().cpu().numpy()
-         for idx, mask in enumerate(grd_mask):
-             color = random_color(rgb=True, maximum=1).astype(np.int32).tolist()
-             demo = visual.draw_binary_mask(mask, color=color, text=texts[idx])
-             res = demo.get_image()
-
-         if inp_text not in ['no', '']:
-             image_crop = image_ori
-             struct2 = ndimage.generate_binary_structure(2, 2)
-             mask_dilated = ndimage.binary_dilation(grd_mask[0], structure=struct2, iterations=3).astype(grd_mask[0].dtype)
-             mask = Image.fromarray(mask_dilated * 255).convert('RGB')
-             image_and_mask = {
-                 "image": image_crop,
-                 "mask": mask,
-             }
-             # images_inpainting = inpainting(inpainting_model, image_and_mask, inp_text, ddim_steps, num_samples, scale, seed)
-             width = image_ori.size[0]; height = image_ori.size[1]
-             images_inpainting = pipe(prompt = inp_text.strip(), image=image_and_mask['image'], mask_image=image_and_mask['mask'], height=height, width=width).images
-             torch.cuda.empty_cache()
-             return images_inpainting[0]
-         else:
-             torch.cuda.empty_cache()
-             return Image.fromarray(res)
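
The GPT-3 few-shot prompt above encodes its answer as "source: [...], target: <...>", and the bracket scan turns the completion back into a grounding phrase and an inpainting prompt; that parse in isolation:

def parse_editing_response(resp_text: str):
    # e.g. " task: (referring editing), source: [red apple], target: <green apple>."
    ref_text = resp_text[resp_text.find('[') + 1:resp_text.find(']')]
    inp_text = resp_text[resp_text.find('<') + 1:resp_text.find('>')]
    return ref_text, inp_text

print(parse_editing_response("source: [red apple], target: <green apple>."))
# ('red apple', 'green apple')
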
tasks/ref_seg.py DELETED
@@ -1,46 +0,0 @@
- # --------------------------------------------------------
- # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
- # Copyright (c) 2022 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Xueyan Zou (xueyan@cs.wisc.edu)
- # --------------------------------------------------------
-
- import torch
- import numpy as np
- from PIL import Image
- from torchvision import transforms
- from utils.visualizer import Visualizer
- from detectron2.utils.colormap import random_color
- from detectron2.data import MetadataCatalog
-
-
- t = []
- t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
- transform = transforms.Compose(t)
- metadata = MetadataCatalog.get('ade20k_panoptic_train')
-
- def referring_segmentation(model, image, texts, inpainting_text, *args, **kwargs):
-     model.model.metadata = metadata
-     texts = texts.strip()
-     texts = [[text.strip() if text.endswith('.') else (text + '.')] for text in texts.split(',')]
-     image_ori = transform(image)
-
-     with torch.no_grad():
-         width = image_ori.size[0]
-         height = image_ori.size[1]
-         image = np.asarray(image_ori)
-         image_ori_np = np.asarray(image_ori)
-         images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-
-         batch_inputs = [{'image': images, 'height': height, 'width': width, 'groundings': {'texts': texts}}]
-         outputs = model.model.evaluate_grounding(batch_inputs, None)
-         visual = Visualizer(image_ori_np, metadata=metadata)
-
-         grd_mask = (outputs[0]['grounding_mask'] > 0).float().cpu().numpy()
-         for idx, mask in enumerate(grd_mask):
-             color = random_color(rgb=True, maximum=1).astype(np.int32).tolist()
-             demo = visual.draw_binary_mask(mask, color=color, text=texts[idx])
-             res = demo.get_image()
-
-     torch.cuda.empty_cache()
-     return Image.fromarray(res), '', None
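
referring_segmentation splits its input on commas and ensures each phrase ends with a period before grounding; that normalization alone, with the strip applied before the period check (which the original skips):

def normalize_phrases(texts: str):
    # "the red car, a dog" -> [['the red car.'], ['a dog.']]
    return [[t.strip() if t.strip().endswith('.') else (t.strip() + '.')]
            for t in texts.strip().split(',')]
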
tasks/reg_ret.py DELETED
@@ -1,72 +0,0 @@
- # --------------------------------------------------------
- # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
- # Copyright (c) 2022 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Xueyan Zou (xueyan@cs.wisc.edu)
- # --------------------------------------------------------
-
- import glob
- import os
- import torch
- import numpy as np
- from PIL import Image
- from torchvision import transforms
- from detectron2.data import MetadataCatalog
- from utils.visualizer import Visualizer
- from xdecoder.language.loss import vl_similarity
- from detectron2.utils.colormap import random_color
-
-
- t = []
- t.append(transforms.Resize((224,224), interpolation=Image.BICUBIC))
- transform_ret = transforms.Compose(t)
- t = []
- t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
- transform_grd = transforms.Compose(t)
- metadata = MetadataCatalog.get('coco_2017_train_panoptic')
-
- imgs_root = 'images/coco'
- img_pths = sorted(glob.glob(os.path.join(imgs_root, '*.jpg')))
- imgs = [Image.open(x).convert('RGB') for x in img_pths]
- v_emb = torch.load("v_emb.da")
-
- def region_retrieval(model, image, texts, inpainting_text, *args, **kwargs):
-     model_novg, model_seg = model
-     with torch.no_grad():
-         # images = [transform_ret(x) for x in imgs]
-         # images = [np.asarray(x) for x in imgs]
-         # images = [torch.from_numpy(x.copy()).permute(2,0,1).cuda() for x in images]
-         # batch_inputs = [{'image': image, 'image_id': 0} for image in images]
-         # outputs = model_novg.model.evaluate(batch_inputs)
-         # v_emb = torch.cat([x['captions'][-1:] for x in outputs])
-         # v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
-         # torch.save(v_emb, "v_emb.da")
-         # exit()
-
-         texts_ = [[x.strip() if x.strip().endswith('.') else (x.strip() + '.')] for x in texts.split(',')]
-         model_novg.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(texts_, is_eval=False, name='caption', prompt=False)
-         t_emb = getattr(model_novg.model.sem_seg_head.predictor.lang_encoder, '{}_text_embeddings'.format('caption'))
-         temperature = model_novg.model.sem_seg_head.predictor.lang_encoder.logit_scale
-
-         logits = vl_similarity(v_emb, t_emb, temperature)
-         prob, idx = logits[:,0].softmax(-1).max(0)
-         image_ori = imgs[idx]
-         image = transform_grd(image_ori)
-         width, height = image.size
-         image = np.asarray(image)
-         image_ori = np.asarray(image)
-         images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-         batch_inputs = [{'image': images, 'height': height, 'width': width, 'groundings': {'texts': texts_}}]
-         model_seg.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(texts_, is_eval=False, name='caption', prompt=False)
-         outputs = model_seg.model.evaluate_grounding(batch_inputs, None)
-
-         visual = Visualizer(image_ori, metadata=metadata)
-         grd_masks = (outputs[0]['grounding_mask'] > 0).float().cpu().numpy()
-
-         for text, mask in zip([x[0] for x in texts_], grd_masks):
-             color = random_color(rgb=True, maximum=1).astype(np.int32).tolist()
-             demo = visual.draw_binary_mask(mask, color=color, text=texts, alpha=0.5)
-             res = demo.get_image()
-
-     torch.cuda.empty_cache()
-     return Image.fromarray(res), "Selected Image Probability: {:.2f}".format(prob.item()), None
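
Retrieval above ranks the cached gallery embeddings in v_emb.da against the query text via vl_similarity; a self-contained sketch of that scoring, assuming vl_similarity is temperature-scaled similarity between normalized embeddings (random tensors stand in for the real ones):

import torch

v_emb = torch.randn(10, 512)   # cached, one embedding per gallery image
t_emb = torch.randn(1, 512)    # one query text embedding
v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
t_emb = t_emb / (t_emb.norm(dim=-1, keepdim=True) + 1e-7)

logits = 100.0 * v_emb @ t_emb.t()           # 100.0 plays logit_scale's role
prob, idx = logits[:, 0].softmax(-1).max(0)  # best image and its probability
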
tasks/text_ret.py DELETED
@@ -1,46 +0,0 @@
- # --------------------------------------------------------
- # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
- # Copyright (c) 2022 Microsoft
- # Licensed under The MIT License [see LICENSE for details]
- # Written by Xueyan Zou (xueyan@cs.wisc.edu)
- # --------------------------------------------------------
-
- import torch
- import numpy as np
- from PIL import Image
- from torchvision import transforms
- from detectron2.data import MetadataCatalog
- from xdecoder.language.loss import vl_similarity
-
-
- t = []
- t.append(transforms.Resize(224, interpolation=Image.BICUBIC))
- transform_ret = transforms.Compose(t)
- t = []
- t.append(transforms.Resize(512, interpolation=Image.BICUBIC))
- transform_grd = transforms.Compose(t)
-
- metedata = MetadataCatalog.get('coco_2017_train_panoptic')
-
- def text_retrieval(model, image, texts, inpainting_text, *args, **kwargs):
-     out_str = ''
-     with torch.no_grad():
-         image = transform_ret(image)
-         image = np.asarray(image)
-         images = torch.from_numpy(image.copy()).permute(2,0,1).cuda()
-         batch_inputs = [{'image': images, 'image_id': 0}]
-         outputs = model.model.evaluate(batch_inputs)
-         v_emb = torch.cat([x['captions'][-1:] for x in outputs])
-         v_emb = v_emb / (v_emb.norm(dim=-1, keepdim=True) + 1e-7)
-
-         texts = [x.strip() for x in texts.split(',')]
-         model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(texts, is_eval=False, name='caption', prompt=False)
-         t_emb = getattr(model.model.sem_seg_head.predictor.lang_encoder, '{}_text_embeddings'.format('caption'))
-         temperature = model.model.sem_seg_head.predictor.lang_encoder.logit_scale
-         logits = vl_similarity(v_emb, t_emb, temperature)
-         topk_prob, topk_idx = logits.softmax(-1)[0].topk(min(5, len(texts)))
-
-         for prob, idx in zip(topk_prob, topk_idx):
-             out_str += "{}:{:.2f}; ".format(texts[idx.item()], prob.item())
-     torch.cuda.empty_cache()
-     return None, out_str, None
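
text_retrieval reports its matches as a single "label:prob; " string; the final topk-and-format step on dummy scores (the candidate strings are illustrative):

import torch

texts = ["a cat", "a dog", "a car"]
logits = torch.randn(1, len(texts))  # image-to-text similarity scores
topk_prob, topk_idx = logits.softmax(-1)[0].topk(min(5, len(texts)))

out_str = "".join("{}:{:.2f}; ".format(texts[i.item()], p.item())
                  for p, i in zip(topk_prob, topk_idx))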