File size: 3,008 Bytes
7a89a47
 
 
 
afbbebb
d68018d
129e315
 
 
7a89a47
44c942e
 
 
 
32c5e0f
27d6b30
44c942e
e82b198
44c942e
f018237
129e315
f018237
 
129e315
 
d68018d
129e315
 
44c942e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129e315
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from smolagents.tools import Tool
import cv2
import numpy as np
import os
import json
from typing import Optional
import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

class DetectElementsTool(Tool):
    """Tool that finds table-like regions or text boxes in a screenshot.

    The screenshot is converted to grayscale, blurred and passed through Canny
    edge detection. The bounding boxes of the external contours are then
    filtered by area and aspect ratio. The kept boxes are drawn on a copy of
    the image, which is written next to the input with a ``_detected`` suffix.
    """

    name = "detect_elements"
    description = "Detects table-like structures or text boxes in a screenshot using OpenCV."
    inputs = {
        "screenshot_path": {"type": "string", "nullable": True, "description": "Path to the screenshot"},
        "element_type": {"type": "string", "default": "table", "nullable": False, "description": "Type: 'table' or 'textbox'"}
    }
    output_type = "string"

    def __init__(self, driver=None):
        """Create the tool.

        Args:
            driver: Optional driver object. It is only stored on the instance;
                nothing in this class reads it.
        """
        super().__init__()
        self.driver = driver  # Store driver for consistency, even if unused
        self.is_initialized = True  # No dependency on driver, so always True
        logger.debug("DetectElementsTool initialized: is_initialized=%s", self.is_initialized)

    def _detected_path(self, screenshot_path):
        """Return the path the annotated copy of ``screenshot_path`` is saved to.

        The ``_detected`` suffix is inserted before the file extension, so the
        result never equals the input path (which would overwrite the original
        screenshot). A path without an extension gets ``.png``.
        """
        root, ext = os.path.splitext(screenshot_path)
        return f"{root}_detected{ext or '.png'}"

    def forward(self, screenshot_path: Optional[str], element_type="table"):
        """Detect elements of ``element_type`` in the image at ``screenshot_path``.

        Args:
            screenshot_path: Path of the screenshot to analyse. ``None`` or an
                empty string is reported as "not found".
            element_type: ``"table"`` (area > 10000, aspect ratio strictly
                between 0.5 and 2.0) or ``"textbox"`` (area > 500, aspect
                ratio > 2.0).

        Returns:
            A JSON string with keys ``"detections"`` (list of
            ``{"type", "bbox": [x, y, w, h]}``) and ``"output_image"`` when at
            least one element matched, the string ``"No elements detected"``
            when none matched, or a human-readable error string. Exceptions
            are never propagated to the caller.
        """
        if not self.is_initialized:
            return "Error: DetectElementsTool is not initialized"
        try:
            if element_type not in ("table", "textbox"):
                return (
                    f"Failed to detect elements: unsupported element_type "
                    f"{element_type!r} (expected 'table' or 'textbox')"
                )
            if not screenshot_path or not os.path.exists(screenshot_path):
                return f"Screenshot not found: {screenshot_path}"

            # Read and preprocess image
            image = cv2.imread(screenshot_path)
            if image is None:
                # imread signals an unreadable / non-image file by returning None
                return f"Failed to detect elements: could not read image: {screenshot_path}"
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            blurred = cv2.GaussianBlur(gray, (5, 5), 0)
            edges = cv2.Canny(blurred, 50, 150)

            # Detect contours
            contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            detections = []

            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                if h <= 0:
                    # Degenerate box: no meaningful aspect ratio, never a match
                    continue
                area = w * h
                aspect_ratio = w / h

                if element_type == "table":
                    # Tables: large and roughly square-ish
                    matched = area > 10000 and 0.5 < aspect_ratio < 2.0
                else:
                    # Text boxes: smaller, clearly wider than tall
                    matched = area > 500 and aspect_ratio > 2.0
                if matched:
                    detections.append({"type": element_type, "bbox": [x, y, w, h]})

            # Draw bounding boxes on a copy of the image (BGR: green for
            # tables, red for text boxes). The copy is written even when
            # nothing was detected.
            output_path = self._detected_path(screenshot_path)
            output_image = image.copy()
            color = (0, 255, 0) if element_type == "table" else (0, 0, 255)
            for detection in detections:
                x, y, w, h = detection["bbox"]
                cv2.rectangle(output_image, (x, y), (x + w, y + h), color, 2)
            if not cv2.imwrite(output_path, output_image):
                return f"Failed to detect elements: could not write image: {output_path}"

            if not detections:
                return "No elements detected"
            return json.dumps({
                "detections": detections,
                "output_image": output_path
            })
        except Exception as e:
            # Tool boundary: report the failure as a string, but keep the traceback in the log.
            logger.exception("Element detection failed for %r", screenshot_path)
            return f"Failed to detect elements: {str(e)}"