# ai_test_generator/scraper.py
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def extract_elements(url: str) -> list[dict]:
    """
    Scrapes a website URL to extract buttons, links, input fields, and forms.

    Args:
        url: The public URL of the website to scrape.

    Returns:
        A list of dictionaries, each representing an extracted UI element.
        Returns an empty list if scraping fails.
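
    Example element (a button entry; the values shown are illustrative):
        {'type': 'button', 'text': 'Submit', 'id': 'submit-btn', 'name': None,
         'class': ['btn'], 'attributes': {}}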
    """
    logging.info(f"Starting scraping for URL: {url}")
    extracted_elements = []

    chrome_options = Options()
    chrome_options.add_argument("--headless") # Run headless (no GUI)
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu") # Recommended for headless
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") # Set user agent

    # service = Service(ChromeDriverManager().install())
    driver = None

    try:
        # --- Use the system's ChromeDriver ---
        # Specify the path to the driver installed via packages.txt
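        # (Assumption) On a Debian-based host such as a Hugging Face Space,
        # packages.txt would typically list the apt packages that provide this
        # binary, e.g. 'chromium' and 'chromium-driver'; exact names can vary.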
        chromedriver_path = "/usr/bin/chromedriver"
        logging.info(f"Attempting to use system chromedriver at: {chromedriver_path}")
        service = Service(executable_path=chromedriver_path)
        # --- End of change ---
        
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.set_page_load_timeout(30) # Set timeout for page load
        driver.get(url)
        # Allow some time for dynamic content to potentially load
        # A more robust solution might use WebDriverWait
        time.sleep(3)
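        # A more robust alternative (sketch, not enabled here): wait explicitly for
        # a known element instead of sleeping a fixed interval. This assumes the
        # page exposes a <body> tag once it is usable.
        # from selenium.webdriver.support.ui import WebDriverWait
        # from selenium.webdriver.support import expected_conditions as EC
        # from selenium.webdriver.common.by import By
        # WebDriverWait(driver, 10).until(
        #     EC.presence_of_element_located((By.TAG_NAME, "body"))
        # )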

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'lxml') # Use lxml parser

        # --- Extract Buttons ---
        buttons = soup.find_all('button')
        for btn in buttons:
            element_data = {
                'type': 'button',
                'text': btn.get_text(strip=True),
                'id': btn.get('id'),
                'name': btn.get('name'),
                'class': btn.get('class'),
                'attributes': {k: v for k, v in btn.attrs.items() if k not in ['id', 'name', 'class']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(buttons)} buttons.")

        # --- Extract Links ---
        links = soup.find_all('a')
        for link in links:
            element_data = {
                'type': 'link',
                'text': link.get_text(strip=True),
                'href': link.get('href'),
                'id': link.get('id'),
                'class': link.get('class'),
                'attributes': {k: v for k, v in link.attrs.items() if k not in ['id', 'class', 'href']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(links)} links.")

        # --- Extract Input Fields ---
        inputs = soup.find_all('input')
        for inp in inputs:
            element_data = {
                'type': 'input',
                'input_type': inp.get('type', 'text'), # Default to 'text' if type not specified
                'id': inp.get('id'),
                'name': inp.get('name'),
                'placeholder': inp.get('placeholder'),
                'value': inp.get('value'),
                'class': inp.get('class'),
                'attributes': {k: v for k, v in inp.attrs.items() if k not in ['id', 'name', 'class', 'type', 'placeholder', 'value']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(inputs)} input fields.")

        # --- Extract Forms ---
        forms = soup.find_all('form')
        for form in forms:
            form_elements = []
            # Find elements within this specific form
            for child_input in form.find_all('input'):
                form_elements.append({
                    'tag': 'input',
                    'type': child_input.get('type'),
                    'id': child_input.get('id'),
                    'name': child_input.get('name')
                })
            for child_button in form.find_all('button'):
                form_elements.append({
                    'tag': 'button',
                    'type': child_button.get('type'),
                    'id': child_button.get('id'),
                    'name': child_button.get('name'),
                    'text': child_button.get_text(strip=True)
                })
            # Add other form element types if needed (select, textarea)
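            # Possible extension (sketch, not enabled here): also record <select>
            # and <textarea> children for more complete form coverage.
            # for child_select in form.find_all('select'):
            #     form_elements.append({
            #         'tag': 'select',
            #         'id': child_select.get('id'),
            #         'name': child_select.get('name'),
            #         'options': [opt.get_text(strip=True) for opt in child_select.find_all('option')]
            #     })
            # for child_textarea in form.find_all('textarea'):
            #     form_elements.append({
            #         'tag': 'textarea',
            #         'id': child_textarea.get('id'),
            #         'name': child_textarea.get('name')
            #     })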

            element_data = {
                'type': 'form',
                'id': form.get('id'),
                'action': form.get('action'),
                'method': form.get('method'),
                'class': form.get('class'),
                'contained_elements': form_elements,
                'attributes': {k: v for k, v in form.attrs.items() if k not in ['id', 'class', 'action', 'method']}
            }
            extracted_elements.append(element_data)
        logging.info(f"Found {len(forms)} forms.")

        logging.info(f"Successfully extracted {len(extracted_elements)} elements in total.")

    except FileNotFoundError:
         logging.error(f"ERROR: System Chromedriver not found at {chromedriver_path}. Make sure 'chromium-driver' is in packages.txt.")

    except Exception as e:
        logging.error(f"Error during scraping URL {url}: {e}", exc_info=True)
        # Return empty list on error, Gradio app will handle this
        return []
    finally:
        if driver:
            driver.quit()
            logging.info("WebDriver closed.")

    return extracted_elements

# Example usage (for testing the scraper independently)
if __name__ == '__main__':
    test_url = "https://demoblaze.com/"
    elements = extract_elements(test_url)
    if elements:
        print(f"Extracted {len(elements)} elements.")
        # Save to a temporary file for inspection
        with open("temp_elements.json", "w", encoding="utf-8") as f:
            json.dump(elements, f, indent=4)
        print("Saved results to temp_elements.json")
    else:
        print("Scraping failed.")