amir.mahla@huggingface.co committed
Commit d32faf0 · 1 Parent(s): 3fe2480

ADD new app

Files changed (6)
  1. README.md +4 -4
  2. app.py +280 -0
  3. assets/google.png +0 -0
  4. prompt.py +143 -0
  5. requirements.txt +8 -0
  6. smolvlm_inference.py +23 -0
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
 title: Smol2Operator Demo
-emoji: 🏆
-colorFrom: blue
-colorTo: yellow
+emoji: 🐢
+colorFrom: purple
+colorTo: green
 sdk: gradio
-sdk_version: 5.46.0
+sdk_version: 5.44.1
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py ADDED
@@ -0,0 +1,280 @@
import re
from typing import Tuple, Optional

import gradio as gr
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from smolvlm_inference import TransformersModel

from prompt import OS_SYSTEM_PROMPT

# --- Configuration ---
MODEL_ID = "smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI"

# --- Model and Processor Loading (Load once) ---
print(f"Loading model and processor for {MODEL_ID}...")
model = None
processor = None
model_loaded = False
load_error_message = ""


model = TransformersModel(
    model_id=MODEL_ID,
    to_device="cuda:0",
)


title = "Smol2Operator Demo"

description = """
This is a demo of the Smol2Operator model designed to interact with graphical user interfaces (GUIs) and perform actions within them.
This proof-of-concept (POC) version, described in [blogpost], showcases the model’s core capabilities.
This compact release is intentionally scoped to fundamental tasks, with complex workflows planned for future iterations. :hugging_face:
"""


SYSTEM_PROMPT: str = OS_SYSTEM_PROMPT

def get_navigation_prompt(task, image, step=1):
    """
    Get the prompt for the navigation task.
    - task: The task to complete
    - image: The current screenshot of the web page
    - step: The current step of the task
    """
    system_prompt = SYSTEM_PROMPT
    return [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": system_prompt},
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"Please generate the next move according to the UI screenshot, instruction and previous actions.\n\nInstruction: {task}\n\nPrevious actions:\nNone"},
            ],
        },
    ]

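# Illustrative only: for the first example task defined further down
# (example_1_task), the user turn built above renders to text like this:
#
#   Please generate the next move according to the UI screenshot, instruction and previous actions.
#
#   Instruction: Search for the name of the current UK Prime Minister.
#
#   Previous actions:
#   None
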
def array_to_image(image_array: np.ndarray) -> Image.Image:
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    # Convert numpy array to PIL Image
    img = Image.fromarray(np.uint8(image_array))
    return img


def parse_actions_from_response(response: str) -> list[str]:
    """Parse actions from model response using regex pattern."""
    pattern = r"<code>\n(.*?)\n</code>"
    matches = re.findall(pattern, response, re.DOTALL)
    return matches

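# Illustrative sketch (the response text is invented, but the <think>/<code>
# structure is what OS_SYSTEM_PROMPT asks the model to produce):
#
#   response = (
#       "<think>\nThe search bar sits near the top of the page, so I click it first.\n</think>\n"
#       "<code>\nclick(x=0.47, y=0.08)\n</code>"
#   )
#   parse_actions_from_response(response)  # -> ["click(x=0.47, y=0.08)"]
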
def extract_coordinates_from_action(action_code: str) -> list[dict]:
    """Extract coordinates from action code for localization actions."""
    localization_actions = []

    # Patterns for different action types
    patterns = {
        # (?<!_) keeps the bare click pattern from also matching the tail of double_click(...)
        'click': r'(?<!_)click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
        'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)'
    }

    for action_type, pattern in patterns.items():
        matches = re.finditer(pattern, action_code)
        for match in matches:
            if action_type == 'drag':
                # Drag has from and to coordinates
                from_x, from_y, to_x, to_y = match.groups()
                localization_actions.append({
                    'type': 'drag_from',
                    'x': float(from_x),
                    'y': float(from_y),
                    'action': action_type
                })
                localization_actions.append({
                    'type': 'drag_to',
                    'x': float(to_x),
                    'y': float(to_y),
                    'action': action_type
                })
            else:
                # Single coordinate actions
                x_val = match.group(1)
                y_val = match.group(2) if match.group(2) else x_val  # Handle single coordinate case
                if x_val and y_val:
                    localization_actions.append({
                        'type': action_type,
                        'x': float(x_val),
                        'y': float(y_val),
                        'action': action_type
                    })

    return localization_actions

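# Quick illustration of the extraction above (coordinate values are invented):
#
#   extract_coordinates_from_action("click(x=0.52, y=0.31)")
#   # -> [{'type': 'click', 'x': 0.52, 'y': 0.31, 'action': 'click'}]
#
#   extract_coordinates_from_action("drag([0.2, 0.3], [0.6, 0.7])")
#   # -> [{'type': 'drag_from', 'x': 0.2, 'y': 0.3, 'action': 'drag'},
#   #     {'type': 'drag_to', 'x': 0.6, 'y': 0.7, 'action': 'drag'}]
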
def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
    """Create an image with localization markers drawn on it."""
    if not coordinates:
        return None

    # Create a copy of the original image
    img_copy = original_image.copy()
    draw = ImageDraw.Draw(img_copy)

    # Get image dimensions
    width, height = img_copy.size

    # Try to load a font, fallback to default if not available
    font = ImageFont.load_default()

    # Color scheme for different actions
    colors = {
        'click': 'red',
        'double_click': 'blue',
        'move_mouse': 'green',
        'drag_from': 'orange',
        'drag_to': 'purple'
    }

    for i, coord in enumerate(coordinates):
        # Convert normalized coordinates to pixel coordinates
        pixel_x = int(coord['x'] * width)
        pixel_y = int(coord['y'] * height)

        # Get color for this action type
        color = colors.get(coord['type'], 'red')

        # Draw a circle at the coordinate
        circle_radius = 8
        draw.ellipse([
            pixel_x - circle_radius, pixel_y - circle_radius,
            pixel_x + circle_radius, pixel_y + circle_radius
        ], fill=color, outline='white', width=2)

        # Add text label
        label = f"{coord['type']}({coord['x']:.2f},{coord['y']:.2f})"
        if font:
            draw.text((pixel_x + 10, pixel_y - 10), label, fill=color, font=font)
        else:
            draw.text((pixel_x + 10, pixel_y - 10), label, fill=color)

        # For drag actions, draw an arrow
        if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
            next_coord = coordinates[i + 1]
            end_x = int(next_coord['x'] * width)
            end_y = int(next_coord['y'] * height)

            # Draw arrow line
            draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)

            # Draw arrowhead
            arrow_size = 10
            dx = end_x - pixel_x
            dy = end_y - pixel_y
            length = (dx**2 + dy**2)**0.5
            if length > 0:
                dx_norm = dx / length
                dy_norm = dy / length

                # Arrowhead points
                arrow_x1 = end_x - arrow_size * dx_norm + arrow_size * dy_norm * 0.5
                arrow_y1 = end_y - arrow_size * dy_norm - arrow_size * dx_norm * 0.5
                arrow_x2 = end_x - arrow_size * dx_norm - arrow_size * dy_norm * 0.5
                arrow_y2 = end_y - arrow_size * dy_norm + arrow_size * dx_norm * 0.5

                draw.polygon([end_x, end_y, arrow_x1, arrow_y1, arrow_x2, arrow_y2], fill='orange')

    return img_copy


# --- Gradio processing function ---
def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
    input_pil_image = array_to_image(input_numpy_image)
    assert isinstance(input_pil_image, Image.Image)

    prompt = get_navigation_prompt(task, input_pil_image)

    print("Prompt:")
    print(prompt)

    if model is None:
        raise ValueError("Model not loaded")

    navigation_str = model.generate(prompt, max_new_tokens=500)
    print(f"Navigation string: {navigation_str}")
    navigation_str = navigation_str.strip()

    # Parse actions from the response
    actions = parse_actions_from_response(navigation_str)

    # Extract coordinates from all actions
    all_coordinates = []
    for action_code in actions:
        coordinates = extract_coordinates_from_action(action_code)
        all_coordinates.extend(coordinates)

    # Create localized image if there are coordinates
    localized_image = None
    if all_coordinates:
        localized_image = create_localized_image(input_pil_image, all_coordinates)
        print(f"Found {len(all_coordinates)} localization actions")

    return navigation_str, localized_image


# --- Load Example Data ---
example_1_image: str = "./assets/google.png"
example_1_image = Image.open(example_1_image)
example_1_task = "Search for the name of the current UK Prime Minister."

example_2_image: str = "./assets/huggingface.png"
example_2_image = Image.open(example_2_image)
example_2_task = "Find the most trending model."


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
    # gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            input_image_component = gr.Image(label="Input UI Image", height=400)
            task_component = gr.Textbox(
                label="task",
                placeholder="e.g., Find the latest model by H Company",
                info="Type the task you want the model to complete.",
            )
            submit_button = gr.Button("Navigate", variant="primary")

        with gr.Column():
            localization_image_component = gr.Image(label="Action Localization", height=400)
            output_coords_component = gr.Textbox(label="Agent Output", lines=20)

    submit_button.click(navigate, [input_image_component, task_component], [output_coords_component, localization_image_component])

    gr.Examples(
        examples=[[example_1_image, example_1_task], [example_2_image, example_2_task]],
        inputs=[input_image_component, task_component],
        outputs=[output_coords_component, localization_image_component],
        fn=navigate,
        cache_examples=True,
    )

demo.queue(api_open=False)
demo.launch(debug=True)
assets/google.png ADDED
prompt.py ADDED
@@ -0,0 +1,143 @@
OS_ACTIONS = """
def final_answer(answer: any) -> any:
    \"\"\"
    Provides a final answer to the given problem.
    Args:
        answer: The final answer to the problem
    \"\"\"

def move_mouse(self, x: float, y: float) -> str:
    \"\"\"
    Moves the mouse cursor to the specified coordinates
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position)
    \"\"\"

def click(x: Optional[float] = None, y: Optional[float] = None) -> str:
    \"\"\"
    Performs a left-click at the specified normalized coordinates
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position)
    \"\"\"

def double_click(x: Optional[float] = None, y: Optional[float] = None) -> str:
    \"\"\"
    Performs a double-click at the specified normalized coordinates
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position)
    \"\"\"

def type(text: str) -> str:
    \"\"\"
    Types the specified text at the current cursor position.
    Args:
        text: The text to type
    \"\"\"

def press(keys: str | list[str]) -> str:
    \"\"\"
    Presses a keyboard key
    Args:
        keys: The key or list of keys to press (e.g. "enter", "space", "backspace", "ctrl", etc.).
    \"\"\"

def navigate_back() -> str:
    \"\"\"
    Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
    \"\"\"

def drag(from_coord: list[float], to_coord: list[float]) -> str:
    \"\"\"
    Clicks [x1, y1], drags mouse to [x2, y2], then release click.
    Args:
        x1: origin x coordinate
        y1: origin y coordinate
        x2: end x coordinate
        y2: end y coordinate
    \"\"\"

def scroll(direction: Literal["up", "down"] = "down", amount: int = 1) -> str:
    \"\"\"
    Moves the mouse to selected coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
    Args:
        x: The x coordinate (horizontal position) of the element to scroll/zoom, defaults to None to not focus on specific coordinates
        y: The y coordinate (vertical position) of the element to scroll/zoom, defaults to None to not focus on specific coordinates
        direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
        amount: The amount to scroll. A good amount is 1 or 2.
    \"\"\"

def wait(seconds: float) -> str:
    \"\"\"
    Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
    Args:
        seconds: Number of seconds to wait, generally 2 is enough.
    \"\"\"
"""

MOBILE_ACTIONS = """
def navigate_back() -> str:
    \"\"\"
    Return to home page
    \"\"\"

def open_app(app_name: str) -> str:
    \"\"\"
    Launches the specified application.
    Args:
        app_name: the name of the application to launch
    \"\"\"

def swipe(from_coord: list[str], to_coord: list[str]) -> str:
    \"\"\"
    swipe from 'from_coord' to 'to_coord'
    Args:
        from_coord: origin coordinates
        to_coord: end coordinates
    \"\"\"

def long_press(x: int, y: int) -> str:
    \"\"\"
    Performs a long-press at the specified coordinates
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position)
    \"\"\"
"""

OS_SYSTEM_PROMPT = f"""You are a helpful GUI agent. You’ll be given a task and a screenshot of the screen. Complete the task using Python function calls.

For each step:
• First, <think></think> to express the thought process guiding your next action and the reasoning behind it.
• Then, use <code></code> to perform the action. it will be executed in a stateful environment.

The following functions are exposed to the Python interpreter:
<code>
{OS_ACTIONS}
</code>

The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
"""

MOBILE_SYSTEM_PROMPT = f"""You are a helpful GUI agent. You’ll be given a task and a screenshot of the screen. Complete the task using Python function calls.

For each step:
• First, <think></think> to express the thought process guiding your next action and the reasoning behind it.
• Then, use <code></code> to perform the action. it will be executed in a stateful environment.

The following functions are exposed to the Python interpreter:
<code>

# OS ACTIONS

{OS_ACTIONS}

# MOBILE ACTIONS

{MOBILE_ACTIONS}
</code>

The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
"""
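
# Illustrative only: a single agent turn that follows the protocol above,
# using actions from OS_ACTIONS (the task, reasoning, and coordinates are invented):
#
#   <think>
#   The search field sits near the top of the page; I click it, type the query, then submit.
#   </think>
#   <code>
#   click(x=0.47, y=0.08)
#   type("current UK Prime Minister")
#   press("enter")
#   </code>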
requirements.txt ADDED
@@ -0,0 +1,8 @@
numpy==2.3.3
Pillow==11.3.0
torch==2.8.0
torchvision==0.23.0
gradio==5.46.0
num2words==0.5.14
transformers==4.56.1
spaces==0.41.0
smolvlm_inference.py ADDED
@@ -0,0 +1,23 @@
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor


class TransformersModel:
    def __init__(self, model_id: str, to_device: str = "cuda"):
        self.model_id = model_id
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.processor.image_processor.size = {"longest_edge": 3 * 384}
        self.model = AutoModelForImageTextToText.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(to_device)

    def generate(self, messages: list[dict], **kwargs):
        inputs = self.processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(self.model.device, dtype=torch.bfloat16)
        generated_ids = self.model.generate(**inputs, **kwargs)
        return self.processor.batch_decode(
            generated_ids[:, len(inputs["input_ids"][0]) :], skip_special_tokens=True
        )[0]
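

# Minimal usage sketch, mirroring (in trimmed form, without the system turn) how
# app.py drives this class. The image path is illustrative and "cuda:0" assumes a GPU:
#
#   from PIL import Image
#   model = TransformersModel("smolagents/SmolVLM2-2.2B-Instruct-Agentic-GUI", to_device="cuda:0")
#   messages = [
#       {"role": "user", "content": [
#           {"type": "image", "image": Image.open("./assets/google.png")},
#           {"type": "text", "text": "Click the search bar."},
#       ]},
#   ]
#   print(model.generate(messages, max_new_tokens=500))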