Spaces:
Running
Running
File size: 3,008 Bytes
7a89a47 afbbebb d68018d 129e315 7a89a47 44c942e 32c5e0f 27d6b30 44c942e e82b198 44c942e f018237 129e315 f018237 129e315 d68018d 129e315 44c942e 129e315 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
from smolagents.tools import Tool
import cv2
import numpy as np
import os
import json
from typing import Optional
import logging
# Module-level logger named after this module; no handlers or levels are
# configured here.
logger = logging.getLogger(__name__)
class DetectElementsTool(Tool):
    """Locate table-like regions or text boxes in a screenshot using OpenCV.

    The image is converted to grayscale, blurred, edge-detected with Canny,
    and the bounding rectangles of the outer contours are filtered by area
    and aspect ratio. Matching rectangles are drawn on a copy of the image
    that is saved next to the input file with a ``_detected`` suffix.
    """

    name = "detect_elements"
    description = "Detects table-like structures or text boxes in a screenshot using OpenCV."
    inputs = {
        "screenshot_path": {"type": "string", "nullable": True, "description": "Path to the screenshot"},
        "element_type": {"type": "string", "default": "table", "nullable": False, "description": "Type: 'table' or 'textbox'"}
    }
    output_type = "string"

    def __init__(self, driver=None):
        """Create the tool.

        Args:
            driver: Optional object stored on the instance as ``self.driver``;
                it is not used by the detection logic in this class.
        """
        super().__init__()
        self.driver = driver  # Stored for consistency, even if unused
        self.is_initialized = True  # No dependency on driver, so always True
        logger.debug("DetectElementsTool initialized: is_initialized=%s", self.is_initialized)

    def forward(self, screenshot_path: Optional[str], element_type="table"):
        """Detect elements of ``element_type`` in the image at ``screenshot_path``.

        Args:
            screenshot_path: Path to an image file readable by ``cv2.imread``.
            element_type: ``"table"`` (area > 10000, aspect ratio between 0.5
                and 2.0) or ``"textbox"`` (area > 500, aspect ratio > 2.0).
                ``None`` is treated as the default ``"table"``.

        Returns:
            A JSON string with keys ``"detections"`` (list of
            ``{"type", "bbox": [x, y, w, h]}``) and ``"output_image"`` (path of
            the annotated copy) when something was detected; the string
            ``"No elements detected"`` when nothing matched; otherwise a
            human-readable error message. Errors are returned, never raised.
        """
        if not self.is_initialized:
            return "Error: DetectElementsTool is not initialized"
        if screenshot_path is None:
            return "Error: screenshot_path is required"
        if element_type is None:
            element_type = "table"
        if element_type not in ("table", "textbox"):
            return f"Unsupported element_type: {element_type!r} (expected 'table' or 'textbox')"

        try:
            if not os.path.exists(screenshot_path):
                return f"Screenshot not found: {screenshot_path}"

            # cv2.imread signals failure by returning None instead of raising.
            image = cv2.imread(screenshot_path)
            if image is None:
                return f"Failed to read image: {screenshot_path}"

            # Preprocess: grayscale -> blur (reduce noise) -> Canny edges.
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            blurred = cv2.GaussianBlur(gray, (5, 5), 0)
            edges = cv2.Canny(blurred, 50, 150)

            # Only outermost contours are considered.
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            detections = []
            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                area = w * h
                aspect_ratio = w / h if h > 0 else 0
                if element_type == "table":
                    # Tables: large and roughly square-ish rectangles.
                    matches = area > 10000 and 0.5 < aspect_ratio < 2.0
                else:
                    # Text boxes: smaller, clearly wider than tall.
                    matches = area > 500 and aspect_ratio > 2.0
                if matches:
                    detections.append({"type": element_type, "bbox": [x, y, w, h]})

            # Build the output path from the real extension so that non-".png"
            # inputs are never overwritten and ".png" elsewhere in the path is
            # left alone. Fall back to ".png" when there is no extension,
            # because cv2.imwrite picks the encoder from the suffix.
            root, ext = os.path.splitext(screenshot_path)
            output_path = f"{root}_detected{ext or '.png'}"

            # Draw bounding boxes on a copy of the image (BGR colors).
            output_image = image.copy()
            color = (0, 255, 0) if element_type == "table" else (0, 0, 255)
            for detection in detections:
                x, y, w, h = detection["bbox"]
                cv2.rectangle(output_image, (x, y), (x + w, y + h), color, 2)

            # cv2.imwrite returns False when the file could not be written.
            if not cv2.imwrite(output_path, output_image):
                return f"Failed to write annotated image: {output_path}"

            if not detections:
                return "No elements detected"
            return json.dumps({
                "detections": detections,
                "output_image": output_path
            })
        except Exception as e:
            # Tool boundary: report the failure as a string to the caller,
            # but keep the traceback in the logs.
            logger.exception("Element detection failed for %s", screenshot_path)
            return f"Failed to detect elements: {str(e)}"