| """Gradio demo for Local UI Locator — standalone HF Space version. |
| |
| Upload a Windows screenshot → detect interactive elements → view overlay + JSON. |
| Self-contained: downloads model weights from HF Hub automatically. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| from collections import Counter |
|
|
| import cv2 |
| import gradio as gr |
| import numpy as np |
| from huggingface_hub import hf_hub_download |
| from ultralytics import YOLO |
|
|
# Detector class names, ordered by YOLO class id (index == class id).
CLASS_NAMES = [
    "button", "textbox", "checkbox", "dropdown", "icon", "tab", "menu_item",
]


# Per-class box/label colors for the overlay. The tuples are applied while
# the overlay is in OpenCV's BGR color order (see _draw_overlay), so each
# triple is (blue, green, red) — TODO confirm the intended on-screen hues.
CLASS_COLORS = {
    "button": (255, 127, 0),
    "textbox": ( 0, 200, 0),
    "checkbox": ( 0, 127, 255),
    "dropdown": (200, 0, 200),
    "icon": ( 0, 150, 255),
    "tab": (255, 0, 100),
    "menu_item": (100, 255, 255),
}



# Fetch the trained weights from the HF Hub and load the model once at
# import time (cached by huggingface_hub after the first download). This
# performs network I/O when the module is imported.
_weights_path = hf_hub_download(
    repo_id="IndextDataLab/windows-ui-locator",
    filename="best.pt",
)
_model = YOLO(_weights_path)
|
|
|
|
def _draw_overlay(img_rgb: np.ndarray, results: list[dict]) -> np.ndarray:
    """Render detection boxes and labels onto a copy of the screenshot.

    Args:
        img_rgb: Input image as an RGB numpy array (as delivered by Gradio).
        results: Detection dicts with "bbox" ([x1, y1, x2, y2]), "type",
            and "score" keys, as produced by ``detect``.

    Returns:
        A new RGB image with one colored rectangle and caption per detection.
    """
    # Work in BGR because OpenCV's drawing primitives expect it.
    overlay = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
    for r in results:
        x1, y1, x2, y2 = r["bbox"]
        color = CLASS_COLORS.get(r["type"], (200, 200, 200))
        label = f"{r['type']} {r['score']:.0%}"

        cv2.rectangle(overlay, (x1, y1), (x2, y2), color, 2)
        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)

        # Place the caption above the box; when the box touches the top edge
        # the label would land at negative y and be clipped off-screen, so
        # fall back to drawing it just inside the box instead.
        label_top = y1 - th - 6
        if label_top < 0:
            label_top = y1
        cv2.rectangle(overlay, (x1, label_top),
                      (x1 + tw + 4, label_top + th + 6), color, -1)
        cv2.putText(overlay, label, (x1 + 2, label_top + th + 2),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1, cv2.LINE_AA)

    return cv2.cvtColor(overlay, cv2.COLOR_BGR2RGB)
|
|
|
|
def detect(
    image: np.ndarray | None,
    conf: float,
    iou: float,
    class_filter: list[str],
) -> tuple[np.ndarray | None, str, str]:
    """Run the UI-element detector and build the three Gradio outputs.

    Args:
        image: Uploaded screenshot as an RGB numpy array, or None.
        conf: Confidence threshold passed to the model.
        iou: IoU threshold used for non-maximum suppression.
        class_filter: Class names to keep; an empty list keeps everything.

    Returns:
        Tuple of (annotated overlay image, markdown summary, JSON string
        of detections). When no image was uploaded, the overlay is None
        and the summary is a prompt to upload one.
    """
    if image is None:
        return None, "Upload an image first.", "[]"

    preds = _model.predict(
        source=image, conf=conf, iou=iou, verbose=False, max_det=300,
    )

    detections: list[dict] = []
    boxes = preds[0].boxes if preds else None
    if boxes is not None:
        coords = boxes.xyxy.cpu().numpy()
        scores = boxes.conf.cpu().numpy()
        class_ids = boxes.cls.cpu().numpy().astype(int)

        for idx in range(len(coords)):
            cid = class_ids[idx]
            # Ids outside the known class list get a synthetic name rather
            # than raising, so an out-of-range model output is still visible.
            cls_name = CLASS_NAMES[cid] if cid < len(CLASS_NAMES) else f"class_{cid}"
            if class_filter and cls_name not in class_filter:
                continue
            box = coords[idx]
            detections.append({
                # Note: ids index the raw model output, so filtering can
                # leave gaps — each detection keeps a stable id.
                "id": idx,
                "type": cls_name,
                "bbox": [int(box[0]), int(box[1]), int(box[2]), int(box[3])],
                "score": round(float(scores[idx]), 4),
                "center": [int((box[0] + box[2]) / 2), int((box[1] + box[3]) / 2)],
            })

    overlay = _draw_overlay(image, detections)

    # Per-class tallies for the markdown summary, alphabetically ordered.
    tallies = Counter(d["type"] for d in detections)
    summary_lines = [f"**{len(detections)} elements detected**"]
    summary_lines.extend(f"- {cls}: {tallies[cls]}" for cls in sorted(tallies))

    return overlay, "\n".join(summary_lines), json.dumps(detections, indent=2)
|
|
|
|
# Build the Gradio UI. Component creation order inside the context managers
# defines the on-page layout: controls on the left, results on the right.
with gr.Blocks(title="Windows UI Element Detector") as demo:
    gr.Markdown(
        "# Windows UI Element Detector\n"
        "Upload a Windows screenshot to detect interactive UI elements "
        "(buttons, textboxes, checkboxes, dropdowns, icons, tabs, menu items).\n\n"
        "**Model:** YOLO11s | **Classes:** 7 | **Dataset:** 3 000 synthetic images"
    )


    with gr.Row():
        # Left column: screenshot upload plus inference controls.
        with gr.Column(scale=1):
            input_image = gr.Image(label="Screenshot", type="numpy")
            with gr.Row():
                conf_slider = gr.Slider(
                    minimum=0.05, maximum=0.95, value=0.3, step=0.05,
                    label="Confidence threshold",
                )
                iou_slider = gr.Slider(
                    minimum=0.1, maximum=0.9, value=0.5, step=0.05,
                    label="IoU threshold (NMS)",
                )
            # Empty selection means "show all classes" (see detect()).
            class_filter = gr.CheckboxGroup(
                choices=CLASS_NAMES,
                label="Filter classes (empty = all)",
            )
            detect_btn = gr.Button("Detect", variant="primary")


        # Right column: annotated overlay and per-class summary.
        with gr.Column(scale=1):
            output_image = gr.Image(label="Detection overlay")
            summary_md = gr.Markdown(label="Summary")


    # Raw detections, collapsed by default.
    with gr.Accordion("JSON output", open=False):
        json_output = gr.Code(language="json", label="Detections JSON")


    # Wire the button to detect(); output order must match detect()'s
    # returned tuple (overlay, summary, JSON).
    detect_btn.click(
        fn=detect,
        inputs=[input_image, conf_slider, iou_slider, class_filter],
        outputs=[output_image, summary_md, json_output],
    )


    gr.Markdown(
        "---\n"
        "MIT License | "
        "[GitHub](https://github.com/Indext-Data-Lab/windows-ui-synth) | "
        "YOLO11s + EasyOCR + rapidfuzz | "
        "Commission a similar tool or a fully integrated AI solution for your business -> "
        "[Visit indext.io](https://indext.io/) | "
        "[Connect on LinkedIn](https://www.linkedin.com/company/indext-data-lab/)"
    )
|
|
# Launch the local Gradio server when run as a script (HF Spaces also
# executes the file directly, so this is the Space entry point).
if __name__ == "__main__":
    demo.launch()
|
|