File size: 2,218 Bytes
47a8e90
 
 
 
68135d2
47a8e90
 
 
 
 
 
 
 
 
289bee5
47a8e90
289bee5
f2fd017
68135d2
 
289bee5
68135d2
 
 
f2fd017
289bee5
68135d2
f2fd017
289bee5
47a8e90
 
 
f2fd017
47a8e90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289bee5
47a8e90
 
c3538d9
 
 
 
47a8e90
 
344bbb0
47a8e90
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import torch
import gradio as gr
import json
from torchvision import transforms
from torchvision.ops import nms
from PIL import Image, ImageDraw, ImageFont

# Locations of the bundled TorchScript detector and of the JSON file holding
# the class-index -> label-name mapping.
TORCHSCRIPT_PATH = "res/screenrecognition-web350k-vins.torchscript"
LABELS_PATH = "res/class_map_vins_manual.json"

# Load the serialized model once at import time so every request reuses it.
model = torch.jit.load(TORCHSCRIPT_PATH)

# Pass the encoding explicitly: without it, open() decodes with the platform's
# locale default, which is not guaranteed to be UTF-8.
with open(LABELS_PATH, "r", encoding="utf-8") as f:
    idx2Label = json.load(f)["idx2Label"]

# Converts the incoming image into the tensor form handed to the model.
img_transforms = transforms.ToTensor()

def inter_class_nms(boxes, scores, labels, iou_threshold=0.5):
    """Suppress overlapping detections without regard to their class.

    ``nms`` is run once over all boxes; ``labels`` is never passed to it, so
    boxes with different labels compete with one another.

    Args:
        boxes: Detection boxes, forwarded to ``nms`` unchanged.
        scores: Score per box, forwarded to ``nms`` unchanged.
        labels: Label per box; only indexed, never used for suppression.
        iou_threshold: Overlap threshold forwarded to ``nms``.

    Returns:
        A dict with keys ``'boxes'``, ``'scores'`` and ``'labels'``, each
        holding the matching input indexed by the result of ``nms``.
    """
    survivors = nms(boxes, scores, iou_threshold)
    return {
        'boxes': boxes[survivors],
        'scores': scores[survivors],
        'labels': labels[survivors],
    }

def predict(img, conf_thresh=0.4):
    """Detect elements in a screenshot and draw them onto a copy of it.

    Args:
        img: Input screenshot as a PIL image, or ``None`` when no image was
            supplied.
        conf_thresh: A detection is drawn only if its score is strictly
            greater than this value.

    Returns:
        A copy of ``img`` with a red box and a "<label> <score>" tag for each
        drawn detection, or ``None`` if ``img`` is ``None``. The input image
        itself is not modified.
    """
    # The UI may invoke this callback with no image; nothing to annotate then.
    if img is None:
        return None

    # Inference only: skip autograd bookkeeping during the forward pass.
    with torch.no_grad():
        # The model returns a pair; only the second element is used here.
        _, raw = model([img_transforms(img)])
    detections = inter_class_nms(raw[0]['boxes'], raw[0]['scores'], raw[0]['labels'])

    out_img = img.copy()
    draw = ImageDraw.Draw(out_img)
    font = ImageFont.truetype("res/Tuffy_Bold.ttf", 25)

    for box, conf_score, label in zip(
        detections['boxes'], detections['scores'], detections['labels']
    ):
        # Compare the tensor element itself (not a converted Python float) so
        # thresholding behaves exactly as before at the boundary.
        if not (conf_score > conf_thresh):
            continue

        # Pixel coordinates must be plain ints for drawing.
        x1, y1, x2, y2 = (int(coord) for coord in box)
        draw.rectangle([x1, y1, x2, y2], outline='red', width=3)

        text = f"{idx2Label[str(int(label))]} {float(conf_score):.2f}"

        # Paint a filled background behind the tag so it stays readable.
        bbox = draw.textbbox((x1, y1), text, font=font)
        draw.rectangle(bbox, fill="red")
        draw.text((x1, y1), text, font=font, fill="black")

    return out_img

# Bundled sample screenshots, each paired with a 0.4 confidence threshold.
example_imgs = [
    [path, 0.4]
    for path in (
        "res/example.jpg",
        "res/screenlane-snapchat-profile.jpg",
        "res/screenlane-snapchat-settings.jpg",
        "res/example_pair1.jpg",
        "res/example_pair2.jpg",
    )
]

# Web UI: an image input and a threshold slider feed `predict`; the image it
# returns is shown as the output.
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Screenshot"),
        gr.Slider(0.0, 1.0, step=0.1, value=0.4),
    ],
    outputs=gr.Image(type="pil", label="Annotated Screenshot").style(height=600),
    examples=example_imgs,
)

interface.launch()