taskswithcode committed
Commit • b0e19ad
1 Parent(s): def93e8
Added
Browse files
- app.py +134 -0
- assets/Hidden_object_game_scaled.png +0 -0
- assets/bus_ovd.jpg +0 -0
- assets/calf.png +0 -0
- assets/road_signs.png +0 -0
- long_form_logo_with_icon.png +0 -0
- requirements.txt +8 -0
- run.sh +1 -0
app.py
ADDED
@@ -0,0 +1,134 @@
+import torch
+import cv2
+import gradio as gr
+import numpy as np
+from transformers import OwlViTProcessor, OwlViTForObjectDetection
+from collections import OrderedDict
+
+
+# Use GPU if available
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+else:
+    device = torch.device("cpu")
+
+model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32").to(device)
+model.eval()
+processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
+
+
+def query_image(img, text_queries, max_results):
+    text_queries = text_queries.split(",")
+
+    target_sizes = torch.Tensor([img.shape[:2]])
+    inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    outputs.logits = outputs.logits.cpu()
+    outputs.pred_boxes = outputs.pred_boxes.cpu()
+    results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
+    boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
+
+    # Index detections by confidence score, then sort highest-first
+    results_dict = {}
+    for box, score, label in zip(boxes, scores, labels):
+        results_dict[score.item()] = {"box": box, "label": label}
+    sorted_results_dict = OrderedDict(sorted(results_dict.items(), reverse=True))
+
+    font = cv2.FONT_HERSHEY_SIMPLEX
+
+    # Collect the top 10 confidence scores for the text output
+    score_dist = []
+    count = 0
+    for score in sorted_results_dict:
+        score_dist.append(round(score, 2))
+        count += 1
+        if count == 10:
+            break
+
+    # Draw the top max_results detections on the image
+    result_count = 0
+    for score in sorted_results_dict:
+        box = sorted_results_dict[score]["box"]
+        label = sorted_results_dict[score]["label"]
+        box = [int(i) for i in box.tolist()]
+
+        img = cv2.rectangle(img, box[:2], box[2:], (255, 0, 0), 1)
+        # Place the caption below the box unless it would fall off the image
+        if box[3] + 25 > img.shape[0]:
+            y = box[3] - 10
+        else:
+            y = box[3] + 25
+
+        rounded_score = round(score, 2)
+        img = cv2.putText(
+            img, f"({rounded_score}):{text_queries[label]}", (box[0], y), font, .5, (255, 0, 0), 1, cv2.LINE_AA
+        )
+        result_count += 1
+        if result_count >= max_results:
+            break
+    return (img, f"Top {count} score confidences: {score_dist}")
+
+
+description = """
+<div style="font-size:18px; color: #2f2f2f; text-align: center">
+<i>This app is a tweaked variation of <a href="https://huggingface.co/spaces/adirik/OWL-ViT">Alara Dirik's OWL-ViT demo</a></i></div>
+<div style="font-size:18px; color: #2f2f2f; text-align: left">
+<b>Use cases of this model</b>
+<br/>1) Given an image with an object, detect it. <i>(e.g. a Where is Waldo? app)</i>
+<br/>2) Given an image with multiple instances of an object, detect them. <i>(e.g. labeling tool assistance for bounding box annotation)</i>
+<br/>3) Find an object within an image using either text or an image as input. <i>(e.g. an image search app - this would require pruning candidates using a threshold over the score distribution in the output. Searching with an input image can be useful for things that are hard to describe in text, like a machine part)</i>
+<br/><div style="font-size:16px; color: #3f3f3f; text-align: left">
+<br/>Links to apps/notebooks of other SOTA models for open-vocabulary or zero-shot object detection
+<br/>a) <a href="https://huggingface.co/spaces/CVPR/regionclip-demo">RegionCLIP</a>
+<br/>b) <a href="https://colab.research.google.com/drive/19LBqQg0cS36rTLL_TaXZ7Ka9KJGkxiSe?usp=sharing">Colab notebook for Object-Centric-OVD</a>
+</div>
+<br/><div style="font-size:16px; color: #4f4f4f; text-align: left">Note: While most examples showcase the model's capabilities, some illustrate its limitations, such as finding the globe, bird cage, or teapot in the picture. Also, the model appears to have some text detection and recognition capability, even if text recognition is very limited.</div>
+<div style="font-size:14px; color: #6f6f6f; text-align: left"><i>Images below are from the <a href="https://en.wikipedia.org/wiki/Hidden_object_game">Wikipedia</a>, <a href="http://images.cocodataset.org/val2017/000000133819.jpg">COCO</a> and <a href="http://host.robots.ox.ac.uk/pascal/VOC/voc2012/">PASCAL VOC 2012</a> datasets</i></div>
+"""
+demo = gr.Interface(
+    query_image,
+    inputs=[gr.Image(), "text", gr.Slider(1, 10, value=1)],
+    outputs=["image", "text"],
+    title="Where is Waldo? <i>(implemented with OWL-ViT)</i>",
+    description=description,
+    examples=[
+        ["assets/Hidden_object_game_scaled.png", "bicycle", 1],
+        ["assets/Hidden_object_game_scaled.png", "laptop", 1],
+        ["assets/Hidden_object_game_scaled.png", "abacus", 1],
+        ["assets/Hidden_object_game_scaled.png", "frog", 1],
+        ["assets/Hidden_object_game_scaled.png", "bird cage", 2],
+        ["assets/Hidden_object_game_scaled.png", "globe", 2],
+        ["assets/Hidden_object_game_scaled.png", "teapot", 3],
+        ["assets/bus_ovd.jpg", "license plate", 1],
+        ["assets/bus_ovd.jpg", "sign saying ARRIVA", 1],
+        ["assets/bus_ovd.jpg", "sign saying ARRIVAL", 1],
+        ["assets/bus_ovd.jpg", "crossing push button", 1],
+        ["assets/bus_ovd.jpg", "building on mountain", 2],
+        ["assets/bus_ovd.jpg", "road marking", 3],
+        ["assets/bus_ovd.jpg", "mirror", 1],
+        ["assets/bus_ovd.jpg", "traffic camera", 1],
+        ["assets/bus_ovd.jpg", "red bus,blue bus", 2],
+        ["assets/calf.png", "snout,tail", 1],
+        ["assets/calf.png", "hoof", 4],
+        ["assets/calf.png", "ear", 2],
+        ["assets/calf.png", "tag", 1],
+        ["assets/calf.png", "hay", 1],
+        ["assets/calf.png", "barbed wire", 1],
+        ["assets/calf.png", "grass", 1],
+        ["assets/calf.png", "can", 2],
+        ["assets/road_signs.png", "STOP", 1],
+        ["assets/road_signs.png", "STOP sign", 1],
+        ["assets/road_signs.png", "arrow", 1],
+        ["assets/road_signs.png", "ROAD", 1],
+        ["assets/road_signs.png", "triangle", 1],
+    ],
+)
+# server_name and server_port are launch() options, not gr.Interface() arguments
+demo.launch(server_name="0.0.0.0", server_port=80, share=True)
assets/Hidden_object_game_scaled.png
ADDED
assets/bus_ovd.jpg
ADDED
assets/calf.png
ADDED
assets/road_signs.png
ADDED
long_form_logo_with_icon.png
ADDED
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+# pip install -r requirements.txt
+
+gradio
+numpy>=1.18.5
+torch>=1.7.0
+torchvision>=0.8.1
+git+https://github.com/huggingface/transformers.git
+opencv-python
run.sh
ADDED
@@ -0,0 +1 @@
+python app.py
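
For a quick check without the Gradio UI, a minimal sketch of calling query_image directly; the image path and queries are taken from the examples list, and reading via OpenCV with a BGR-to-RGB conversion is an assumption about matching the RGB numpy arrays gr.Image supplies:

    # Assumes query_image from app.py is already defined in the session
    # (importing app.py directly would also start the Gradio server)
    import cv2

    img = cv2.cvtColor(cv2.imread("assets/bus_ovd.jpg"), cv2.COLOR_BGR2RGB)  # gr.Image passes RGB arrays
    annotated, summary = query_image(img, "license plate,mirror", 2)
    print(summary)
    cv2.imwrite("annotated.png", cv2.cvtColor(annotated, cv2.COLOR_RGB2BGR))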