import gradio as gr
import PIL.Image
import torch
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


class Detector:
    def __init__(self, model_id: str):
        self.device = DEVICE
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(
            self.device
        )

    def detect(
        self,
        image: PIL.Image.Image,
        text_labels: list[str],
        threshold: float = 0.4,
    ):
        # Preprocess the image and tokenize the text prompts in one call.
        inputs = self.processor(
            images=image, text=[text_labels], return_tensors="pt"
        ).to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Map raw model outputs to thresholded boxes in image-pixel coordinates.
        results = self.processor.post_process_grounded_object_detection(
            outputs, threshold=threshold, target_sizes=[(image.height, image.width)]
        )
        detections = []
        result = results[0]
        for box, score, label in zip(
            result["boxes"], result["scores"], result["text_labels"]
        ):
            box = [round(x, 2) for x in box.tolist()]
            detections.append(
                dict(
                    label=label,
                    confidence=round(score.item(), 3),
                    box=box,
                )
            )
        return detections
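
# Detector.detect returns a list of plain dicts per image, e.g. (illustrative values only):
#   [{"label": "a cat", "confidence": 0.812, "box": [12.45, 30.1, 220.9, 310.0]}]
# where "box" is [x_min, y_min, x_max, y_max] in image pixels.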

models = dict(
    tiny=Detector("iSEE-Laboratory/llmdet_tiny"),
    base=Detector("iSEE-Laboratory/llmdet_base"),
    large=Detector("iSEE-Laboratory/llmdet_large"),
)
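
# All three checkpoints are loaded up front so one request can be compared across
# model sizes. A single detector can also be exercised on its own, e.g.
# (hypothetical local file):
#   models["tiny"].detect(PIL.Image.open("cat.jpg"), ["a cat"], threshold=0.3)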


def _postprocess(detections):
    annotations = []
    for detection in detections:
        box = detection["box"]
        bbox = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
        label = f"{detection['label']} ({detection['confidence']:.2f})"
        annotations.append((bbox, label))
    return annotations
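
# gr.AnnotatedImage accepts each annotation as either a mask or an
# (x_min, y_min, x_max, y_max) pixel tuple paired with a string label;
# only box tuples are used here.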


def detect_objects(image, labels, confidence_threshold):
    labels = [label.strip() for label in labels.split(",")]
    detections = []
    # Run every model size on the same image and label set.
    for model_name in models:
        detection = models[model_name].detect(
            image,
            labels,
            threshold=confidence_threshold,
        )
        detections.append(_postprocess(detection))
    return tuple((image, det) for det in detections)
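
# The click handler below expects one (image, annotations) pair per output component,
# so this tuple lines up with the tiny/base/large AnnotatedImage outputs in order.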

with gr.Blocks(delete_cache=(5, 10)) as demo:
    gr.Markdown(
        "# LLMDet Arena ✨\n ### [Paper](https://arxiv.org/abs/2501.18954) - [Repository](https://github.com/iSEE-Laboratory/LLMDet)"
    )
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Input Image")
            image_input = gr.Image(type="pil", image_mode="RGB", format="jpeg")
        with gr.Column():
            gr.Markdown("## Settings")
            confidence_slider = gr.Slider(
                0,
                1,
                value=0.3,
                step=0.01,
                interactive=True,
                label="Confidence threshold:",
            )
            labels = ["a cat", "a remote control"]
            text_input = gr.Textbox(
                label="Object labels (comma separated):",
                placeholder=",".join(labels),
                lines=1,
            )
    with gr.Row():
        detect_button = gr.Button("Detect Objects")
    with gr.Row():
        gr.Markdown("## Output Annotated Images")
    with gr.Row():
        output_annotated_image_tiny = gr.AnnotatedImage(label="TINY", format="jpeg")
        output_annotated_image_base = gr.AnnotatedImage(label="BASE", format="jpeg")
        output_annotated_image_large = gr.AnnotatedImage(label="LARGE", format="jpeg")

    # Connect the button to the detection function
    detect_button.click(
        fn=detect_objects,
        inputs=[image_input, text_input, confidence_slider],
        outputs=[
            output_annotated_image_tiny,
            output_annotated_image_base,
            output_annotated_image_large,
        ],
    )

    with gr.Row():
        gr.Markdown("## Examples")
    with gr.Row():
        gr.Examples(
            examples=[
                [
                    "http://images.cocodataset.org/val2017/000000039769.jpg",
                    "a cat, a remote control",
                    0.3,
                ],
                [
                    "http://images.cocodataset.org/val2017/000000370486.jpg",
                    "a person",
                    0.3,
                ],
                [
                    "http://images.cocodataset.org/train2017/000000345263.jpg",
                    "a red apple, a green apple",
                    0.3,
                ],
            ],
            inputs=[image_input, text_input, confidence_slider],
            outputs=[
                output_annotated_image_tiny,
                output_annotated_image_base,
                output_annotated_image_large,
            ],
            fn=detect_objects,
            cache_examples=True,
        )
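
    # With cache_examples=True, Gradio precomputes the outputs for these example rows
    # by calling fn once per row, so clicking an example shows stored results instead
    # of re-running the models.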


if __name__ == "__main__":
    demo.launch()
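
# Running this file directly (`python app.py`) calls demo.launch(), which starts
# the Gradio server and serves the UI defined above.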