from models.builder import build_model
from visualization import mask2rgb
from segmentation.datasets import PascalVOCDataset

import os
from hydra import compose, initialize
from PIL import Image
import matplotlib.pyplot as plt
from torchvision import transforms as T
import torch.nn.functional as F
import numpy as np
from operator import itemgetter 
import torch
import random
import warnings

warnings.filterwarnings("ignore")
initialize(config_path="configs", version_base=None)

from huggingface_hub import Repository

repo = Repository(
    local_dir="clip-dinoiser",
    clone_from="ariG23498/clip-dinoiser",
    use_auth_token=os.environ.get("token")
)

check_path = 'clip-dinoiser/checkpoints/last.pt'
device = "cuda" if torch.cuda.is_available() else "cpu"

check = torch.load(check_path, map_location=device)
dinoclip_cfg = "clip_dinoiser.yaml"
cfg = compose(config_name=dinoclip_cfg)

model = build_model(cfg.model, class_names=PascalVOCDataset.CLASSES).to(device)
model.clip_backbone.decode_head.use_templates = False  # switch off the ImageNet templates for faster inference
model.load_state_dict(check['model_state_dict'], strict=False)
model = model.eval()

import gradio as gr

def run_clip_dinoiser(input_image, text_prompts):
    image = input_image.convert("RGB")
    text_prompts = text_prompts.split(",")
    palette = [
        (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) for _ in range(len(text_prompts))
    ]

    model.clip_backbone.decode_head.update_vocab(text_prompts)
    model.to(device)
    model.apply_found = True

    img_tens = T.PILToTensor()(image).unsqueeze(0).to(device) / 255.

    h, w = img_tens.shape[-2:]
    output = model(img_tens).cpu()
    output = F.interpolate(
        output, scale_factor=model.clip_backbone.backbone.patch_size,
        mode="bilinear", align_corners=False
    )[..., :h, :w]
    output = output[0].argmax(dim=0)
    mask = mask2rgb(output, palette)


    classes = np.unique(output).tolist()
    palette_array = np.array(itemgetter(*classes)(palette)).reshape(1, -1, 3)
    alpha = 0.5
    blend = alpha * np.array(image) / 255. + (1 - alpha) * mask / 255.
    return palette_array, blend, mask

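# A minimal sketch of calling the model without the Gradio UI, assuming the
# example image "vintage_bike.jpeg" used in the demo below is available locally:
#
#   img = Image.open("vintage_bike.jpeg")
#   palette_img, overlay, seg_mask = run_clip_dinoiser(img, "background, vintage bike, leather bag")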

if __name__ == "__main__":

    block = gr.Blocks().queue()
    with block:
        gr.Markdown("<h1><center>CLIP-DINOiser<h1><center>")

        with gr.Row():
            with gr.Column():
                input_image = gr.Image(source='upload', type="pil")
                text_prompts = gr.Textbox(label="Enter comma-separated prompts")
                run_button = gr.Button("Run")

            with gr.Column():
                palette_array = gr.Image(type="numpy", label="Palette")
                with gr.Row():
                    overlay_mask = gr.Image(type="numpy", label="Overlay")
                    only_mask = gr.Image(type="numpy", label="Mask")

        run_button.click(
            fn=run_clip_dinoiser,
            inputs=[input_image, text_prompts],
            outputs=[palette_array, overlay_mask, only_mask],
        )
        gr.Examples(
            [["vintage_bike.jpeg", "background, vintage bike, leather bag"]],
            inputs=[input_image, text_prompts],
            outputs=[palette_array, overlay_mask, only_mask],
            fn=run_clip_dinoiser,
            cache_examples=True,
            label="Try this example input!",
        )
    block.launch(share=False, show_api=False, show_error=True)