import spaces
import re
from typing import Tuple, Optional

import gradio as gr
import numpy as np
from PIL import Image, ImageDraw, ImageFont

from smolvlm_inference import TransformersModel
from prompt import OS_SYSTEM_PROMPT

# --- Configuration ---
MODEL_ID = "smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI"

# --- Model and Processor Loading (Load once) ---
print(f"Loading model and processor for {MODEL_ID}...")
model = None
processor = None
model_loaded = False
load_error_message = ""

model = TransformersModel(
    model_id=MODEL_ID,
    to_device="cuda:0",
)

title = "Smol2Operator Demo"
description = """
This is a demo of the Smol2Operator model, designed to interact with graphical user interfaces (GUIs) and perform actions within them.
This proof-of-concept (POC) version, described in [blogpost], showcases the model's core capabilities.
This compact release is intentionally scoped to fundamental tasks, with complex workflows planned for future iterations. :hugging_face:
"""

SYSTEM_PROMPT: str = OS_SYSTEM_PROMPT


def get_navigation_prompt(task, image, step=1):
    """
    Get the prompt for the navigation task.
    - task: The task to complete
    - image: The current screenshot of the web page
    - step: The current step of the task
    """
    system_prompt = SYSTEM_PROMPT
    return [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": system_prompt},
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"Please generate the next move according to the UI screenshot, instruction and previous actions.\n\nInstruction: {task}\n\nPrevious actions:\nNone"},
            ],
        },
    ]


def array_to_image(image_array: np.ndarray) -> Image.Image:
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    # Convert numpy array to PIL Image
    img = Image.fromarray(np.uint8(image_array))
    return img


def parse_actions_from_response(response: str) -> list[str]:
    """Parse actions from model response using regex pattern."""
    pattern = r"\n(.*?)\n"
    matches = re.findall(pattern, response, re.DOTALL)
    return matches


def extract_coordinates_from_action(action_code: str) -> list[dict]:
    """Extract coordinates from action code for localization actions."""
    localization_actions = []

    # Patterns for different action types.
    # The negative lookbehind on 'click' prevents double_click() calls from also
    # being matched as plain click() actions.
    patterns = {
        'click': r'(?<!double_)click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
        'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)',
    }

    for action_type, pattern in patterns.items():
        matches = re.finditer(pattern, action_code)
        for match in matches:
            if action_type == 'drag':
                # Drag has from and to coordinates
                from_x, from_y, to_x, to_y = match.groups()
                localization_actions.append({
                    'type': 'drag_from',
                    'x': float(from_x),
                    'y': float(from_y),
                    'action': action_type,
                })
                localization_actions.append({
                    'type': 'drag_to',
                    'x': float(to_x),
                    'y': float(to_y),
                    'action': action_type,
                })
            else:
                # Single coordinate actions
                x_val = match.group(1)
                # Handle the single-coordinate case by reusing x for y
                y_val = match.group(2) if match.group(2) else x_val
                if x_val and y_val:
                    localization_actions.append({
                        'type': action_type,
                        'x': float(x_val),
                        'y': float(y_val),
                        'action': action_type,
                    })

    return localization_actions
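# Illustrative sketch of what extract_coordinates_from_action returns (hypothetical action
# strings, not actual model output):
#   extract_coordinates_from_action("click(x=0.32, y=0.84)")
#     -> [{'type': 'click', 'x': 0.32, 'y': 0.84, 'action': 'click'}]
#   extract_coordinates_from_action("drag([0.10, 0.20], [0.55, 0.60])")
#     -> a 'drag_from' entry at (0.10, 0.20) followed by a 'drag_to' entry at (0.55, 0.60)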
def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
    """Create an image with localization markers drawn on it."""
    if not coordinates:
        return None

    # Create a copy of the original image
    img_copy = original_image.copy()
    draw = ImageDraw.Draw(img_copy)

    # Get image dimensions
    width, height = img_copy.size

    # Try to load a font, fallback to default if not available
    font = ImageFont.load_default()

    # Color scheme for different actions
    colors = {
        'click': 'red',
        'double_click': 'blue',
        'move_mouse': 'green',
        'drag_from': 'orange',
        'drag_to': 'purple',
    }

    for i, coord in enumerate(coordinates):
        # Convert normalized coordinates to pixel coordinates
        pixel_x = int(coord['x'] * width)
        pixel_y = int(coord['y'] * height)

        # Get color for this action type
        color = colors.get(coord['type'], 'red')

        # Draw a circle at the coordinate
        circle_radius = 8
        draw.ellipse(
            [
                pixel_x - circle_radius,
                pixel_y - circle_radius,
                pixel_x + circle_radius,
                pixel_y + circle_radius,
            ],
            fill=color,
            outline='white',
            width=2,
        )

        # Add text label
        label = f"{coord['type']}({coord['x']:.2f},{coord['y']:.2f})"
        if font:
            draw.text((pixel_x + 10, pixel_y - 10), label, fill=color, font=font)
        else:
            draw.text((pixel_x + 10, pixel_y - 10), label, fill=color)

        # For drag actions, draw an arrow
        if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
            next_coord = coordinates[i + 1]
            end_x = int(next_coord['x'] * width)
            end_y = int(next_coord['y'] * height)

            # Draw arrow line
            draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)

            # Draw arrowhead
            arrow_size = 10
            dx = end_x - pixel_x
            dy = end_y - pixel_y
            length = (dx**2 + dy**2) ** 0.5
            if length > 0:
                dx_norm = dx / length
                dy_norm = dy / length

                # Arrowhead points
                arrow_x1 = end_x - arrow_size * dx_norm + arrow_size * dy_norm * 0.5
                arrow_y1 = end_y - arrow_size * dy_norm - arrow_size * dx_norm * 0.5
                arrow_x2 = end_x - arrow_size * dx_norm - arrow_size * dy_norm * 0.5
                arrow_y2 = end_y - arrow_size * dy_norm + arrow_size * dx_norm * 0.5
                draw.polygon([end_x, end_y, arrow_x1, arrow_y1, arrow_x2, arrow_y2], fill='orange')

    return img_copy
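# A quick sanity check of the coordinate convention used above (assumed normalized [0, 1] values,
# as produced by extract_coordinates_from_action): on a 1920x1080 screenshot, a click at
# (x=0.32, y=0.84) is drawn at pixel (int(0.32 * 1920), int(0.84 * 1080)) = (614, 907).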

# --- Gradio processing function ---
@spaces.GPU
def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
    input_pil_image = array_to_image(input_numpy_image)
    assert isinstance(input_pil_image, Image.Image)

    prompt = get_navigation_prompt(task, input_pil_image)
    if model is None:
        raise ValueError("Model not loaded")

    navigation_str = model.generate(prompt, max_new_tokens=500)
    print(f"Navigation string: {navigation_str}")
    navigation_str = navigation_str.strip()

    # Parse actions from the response
    actions = parse_actions_from_response(navigation_str)

    # Extract coordinates from all actions
    all_coordinates = []
    for action_code in actions:
        coordinates = extract_coordinates_from_action(action_code)
        all_coordinates.extend(coordinates)

    # Create localized image if there are coordinates
    localized_image = None
    if all_coordinates:
        localized_image = create_localized_image(input_pil_image, all_coordinates)
        print(f"Found {len(all_coordinates)} localization actions")

    return navigation_str, localized_image


# --- Load Example Data ---
example_1_image = Image.open("./assets/google.png")
example_1_task = "Search for the name of the current UK Prime Minister."

example_2_image = Image.open("./assets/huggingface.png")
example_2_task = "Find the most trending model."

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"""
# {title}
""")
    # gr.Markdown(description)

    with gr.Row():
        input_image_component = gr.Image(label="UI Image", height=500)

    with gr.Row():
        with gr.Column():
            task_component = gr.Textbox(
                label="task",
                placeholder="e.g., Search for the name of the current UK Prime Minister.",
                info="Type the task you want the model to complete.",
            )
            submit_button = gr.Button("Call Agent", variant="primary")
        with gr.Column():
            output_coords_component = gr.Textbox(label="Agent Output", lines=10)

    submit_button.click(
        navigate,
        [input_image_component, task_component],
        [output_coords_component, input_image_component],
    )

    gr.Examples(
        examples=[[example_1_image, example_1_task], [example_2_image, example_2_task]],
        inputs=[input_image_component, task_component],
        outputs=[output_coords_component, input_image_component],
        fn=navigate,
        cache_examples=True,
    )

demo.queue(api_open=False)
demo.launch(debug=True)