File size: 1,419 Bytes
550b53d
 
 
 
 
 
 
 
 
 
 
 
 
c30cdc1
f995acb
 
550b53d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f995acb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import clip
import torch
import gradio as gr
import torchvision.transforms as T
from PIL import Image
try:
    from torchvision.transforms import InterpolationMode
    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC
import warnings
warnings.filterwarnings("ignore")

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load hands back the network together with its matching image transform.
model, preprocess = clip.load('ViT-L/14@336px')
model.to(device)


def zeroshot_detection(Press_Clear_Dont_Stack_Image):
    """Score an image against the hard-coded captions using CLIP.

    Args:
        Press_Clear_Dont_Stack_Image: Image to classify. It is fed straight
            into the CLIP ``preprocess`` transform; the Gradio input below is
            configured with ``type="pil"``.

    Returns:
        A dict mapping each caption to its softmax probability, inserted in
        descending order of probability.

    Raises:
        ValueError: If no image was supplied.
    """
    inp = Press_Clear_Dont_Stack_Image
    if inp is None:
        # Fail with a readable message instead of an opaque error from deep
        # inside the preprocessing transform (e.g. submit with nothing uploaded).
        raise ValueError("No image provided; upload an image before submitting.")

    # CHANGE THIS IF YOU WANT TO CHANGE THE PREDICTION: separate by commas
    captions = "photo of a guardrail, no guardrail in the photo"

    # Trim the whitespace that follows each comma so the labels returned to
    # the UI carry no stray leading space, and ignore empty fragments that a
    # trailing or doubled comma would otherwise produce.
    labels = [part.strip() for part in captions.split(',') if part.strip()]

    tokens = clip.tokenize(labels).to(device)
    image = preprocess(inp).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(tokens)
        # Unit-normalise both embeddings so the matrix product below is a
        # cosine similarity between the image and every caption.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    # topk over all labels yields every score sorted from highest to lowest.
    values, indices = similarity[0].topk(len(labels))
    return {
        labels[index]: float(value)
        for value, index in zip(values.tolist(), indices.tolist())
    }

# Wire the classifier into a one-image-in, top-label-out web UI and serve it.
demo = gr.Interface(
    fn=zeroshot_detection,
    inputs=[gr.Image(type="pil")],
    outputs=gr.Label(num_top_classes=1),
)
demo.launch()