File size: 4,072 Bytes
47a8e90
 
 
 
 
 
 
 
 
 
 
 
 
289bee5
47a8e90
289bee5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47a8e90
 
 
289bee5
47a8e90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289bee5
47a8e90
 
c3538d9
 
 
 
47a8e90
 
344bbb0
47a8e90
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import torch
import gradio as gr
import json
from torchvision import transforms
from PIL import Image, ImageDraw, ImageFont

# Locations of the bundled TorchScript detector and of the JSON file that maps
# class indices to human-readable labels.
TORCHSCRIPT_PATH = "res/screenrecognition-web350k-vins.torchscript"
LABELS_PATH = "res/class_map_vins_manual.json"

# predict() feeds plain CPU tensors to the model, so pin the weights to the CPU
# as well; this also lets an archive that was exported on a GPU machine load on
# a CPU-only host.
model = torch.jit.load(TORCHSCRIPT_PATH, map_location="cpu")

# JSON is UTF-8 by specification; be explicit so decoding does not depend on
# the platform's locale encoding.
with open(LABELS_PATH, "r", encoding="utf-8") as f:
    # Keys of idx2Label are stringified class indices (predict() looks labels
    # up with str(int(label))).
    idx2Label = json.load(f)["idx2Label"]

# Turns the PIL screenshot handed over by Gradio into a tensor for the model.
img_transforms = transforms.ToTensor()

# Class-agnostic ("inter-class") non-maximum suppression, the pairwise IoU
# helper it relies on, and the Gradio prediction callback that uses both.
def inter_class_nms(boxes, scores, iou_threshold=0.5, labels=None):
    """Greedy non-maximum suppression applied across all classes at once.

    Detections are visited from highest to lowest score; a detection is
    dropped when its IoU with an already kept detection is not below
    ``iou_threshold`` -- regardless of which class either detection has.

    Args:
    - boxes (Tensor[N, 4] or nested sequence): boxes as (x1, y1, x2, y2)
    - scores (Tensor[N] or Tensor[N, C]): one confidence per box, or one
      confidence per box and class. In the 2-D case the best class score is
      used as the confidence of the box.
    - iou_threshold (float): boxes overlapping a kept box by this IoU or more
      are suppressed
    - labels (Tensor[N], optional): class index of every box. When omitted
      and ``scores`` is 2-D, the argmax over the class axis is used.

    Returns:
    - dict with the surviving detections, ordered by descending score:
      'boxes' (Tensor[K, 4]), 'scores' (Tensor[K]), 'labels' (Tensor[K], or
      None when no labels were given and none could be derived) and 'keep'
      (Tensor[K] of indices into the input).

    Raises:
    - ValueError: if ``scores`` is neither 1-D nor 2-D, or if the number of
      scores does not match the number of boxes.
    """
    # reshape(-1, 4) also normalizes an empty input to shape [0, 4].
    boxes = torch.as_tensor(boxes).reshape(-1, 4)
    scores = torch.as_tensor(scores)

    if scores.dim() == 2:
        # Collapse per-class scores to one confidence (and class) per box.
        scores, best_class = scores.max(dim=1)
        if labels is None:
            labels = best_class
    elif scores.dim() != 1:
        raise ValueError(
            "scores must be 1-D or 2-D, got {} dimensions".format(scores.dim())
        )

    if scores.shape[0] != boxes.shape[0]:
        raise ValueError(
            "got {} boxes but {} scores".format(boxes.shape[0], scores.shape[0])
        )

    if labels is not None:
        labels = torch.as_tensor(labels)

    # Candidate indices, best score first.
    order = torch.argsort(scores, descending=True)
    kept_indices = []

    while order.numel() > 0:
        # The best remaining candidate always survives.
        top = order[0]
        kept_indices.append(int(top))

        rest = order[1:]
        if rest.numel() == 0:
            break

        # IoU of every remaining candidate with the box just kept; iou()
        # returns an [R, 1] matrix, so take its only column.
        overlaps = iou(boxes[rest], boxes[top])[:, 0]

        # Keep only candidates that do not overlap the kept box too much.
        order = rest[overlaps < iou_threshold]

    keep = torch.tensor(kept_indices, dtype=torch.long, device=boxes.device)

    return {
        'boxes': boxes[keep],
        'scores': scores[keep],
        'labels': labels[keep] if labels is not None else None,
        'keep': keep,
    }


def iou(boxes1, boxes2):
    """
    Compute the pairwise Intersection over Union (IoU) of two sets of boxes.

    Boxes are given as (x1, y1, x2, y2). A single box may be passed as a 1-D
    tensor of four values; it is treated as a set containing one box.

    Args:
    - boxes1 (Tensor[N, 4] or Tensor[4]): first set of boxes
    - boxes2 (Tensor[M, 4] or Tensor[4]): second set of boxes

    Returns:
    - Tensor[N, M]: the IoU of every box in boxes1 with every box in boxes2.
      Pairs whose union has no area (degenerate boxes) get an IoU of 0
      instead of NaN.
    """
    boxes1 = torch.as_tensor(boxes1).reshape(-1, 4)
    boxes2 = torch.as_tensor(boxes2).reshape(-1, 4)

    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])

    # Corners of the intersection rectangle for every (box1, box2) pair.
    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    # Non-overlapping pairs produce negative extents; clamp them to zero.
    wh = (rb - lt).clamp(min=0)  # [N,M,2]
    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter  # [N,M]

    # Wherever the union is not positive the intersection is 0 as well, so
    # dividing by 1 there yields 0 and avoids a 0/0 NaN.
    safe_union = torch.where(union > 0, union, torch.ones_like(union))

    return inter / safe_union


def predict(img, conf_thresh=0.4):
    """Detect UI elements in a screenshot and draw them onto a copy of it.

    Args:
    - img (PIL.Image): the screenshot to analyse; None (no image submitted)
      is passed through as None
    - conf_thresh (float): only detections scoring above this are drawn

    Returns:
    - PIL.Image: copy of ``img`` with a red box and a "<label> <score>"
      caption per detection, or None when ``img`` is None
    """
    if img is None:
        return None

    img_input = [img_transforms(img)]

    # Pure inference: no autograd bookkeeping needed.
    with torch.no_grad():
        _, pred = model(img_input)

    # NOTE(review): assumes the model returns one dict per input image with
    # 'boxes', 'scores' and 'labels' entries, as the pred[0][...] indexing of
    # the drawing code implies -- confirm against the exported model.
    detections = pred[0]
    kept = inter_class_nms(
        detections['boxes'], detections['scores'], labels=detections['labels']
    )

    out_img = img.copy()
    draw = ImageDraw.Draw(out_img)
    font = ImageFont.truetype("res/Tuffy_Bold.ttf", 25)

    for box, conf_score, label in zip(kept['boxes'], kept['scores'], kept['labels']):
        if not conf_score > conf_thresh:
            continue

        x1, y1, x2, y2 = (int(coord) for coord in box)
        draw.rectangle([x1, y1, x2, y2], outline='red', width=3)

        text = idx2Label[str(int(label))] + " {:.2f}".format(float(conf_score))

        # Paint a filled background behind the caption so it stays readable.
        bbox = draw.textbbox((x1, y1), text, font=font)
        draw.rectangle(bbox, fill="red")
        draw.text((x1, y1), text, font=font, fill="black")

    return out_img

# Sample inputs offered below the demo: [screenshot path, confidence threshold].
example_imgs = [
    ["res/example.jpg", 0.4],
    ["res/screenlane-snapchat-profile.jpg", 0.4],
    ["res/screenlane-snapchat-settings.jpg", 0.4],
    ["res/example_pair1.jpg", 0.4],
    ["res/example_pair2.jpg", 0.4],
]

# Gradio UI: a screenshot plus a confidence slider go into predict(), and the
# annotated screenshot comes back out.
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(type="pil", label="Screenshot"),
        gr.Slider(0.0, 1.0, step=0.1, value=0.4),
    ],
    outputs=gr.Image(type="pil", label="Annotated Screenshot").style(height=600),
    examples=example_imgs,
)

interface.launch()