m-ric HF Staff committed on
Commit
d2a16a2
·
1 Parent(s): 8772a92

Prepare 2-step crosshair

Browse files
Files changed (2) hide show
  1. app.py +0 -1
  2. e2bqwen.py +25 -18
app.py CHANGED
@@ -392,7 +392,6 @@ def get_or_create_sandbox(session_hash):
392
  'created_at': current_time,
393
  'last_accessed': current_time
394
  }
395
-
396
  return desktop
397
 
398
  def update_html(interactive_mode: bool, request: gr.Request):
 
392
  'created_at': current_time,
393
  'last_accessed': current_time
394
  }
 
395
  return desktop
396
 
397
  def update_html(interactive_mode: bool, request: gr.Request):
e2bqwen.py CHANGED
@@ -89,6 +89,16 @@ Use click to move through menus on the desktop and scroll for web and specific a
89
  REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
90
  """
91
 
 
 
 
 
 
 
 
 
 
 
92
  class E2BVisionAgent(CodeAgent):
93
  """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
94
  def __init__(
@@ -148,7 +158,8 @@ class E2BVisionAgent(CodeAgent):
148
  self.desktop.move_mouse(x, y)
149
  self.desktop.left_click()
150
  self.logger.log(f"Clicked at coordinates ({x}, {y})")
151
- self.click_coordinates = [x, y]
 
152
  return f"Clicked at coordinates ({x}, {y})"
153
 
154
  @tool
@@ -162,7 +173,7 @@ class E2BVisionAgent(CodeAgent):
162
  self.desktop.move_mouse(x, y)
163
  self.desktop.right_click()
164
  self.logger.log(f"Right-clicked at coordinates ({x}, {y})")
165
- self.click_coordinates = [x, y]
166
  return f"Right-clicked at coordinates ({x}, {y})"
167
 
168
  @tool
@@ -176,7 +187,7 @@ class E2BVisionAgent(CodeAgent):
176
  self.desktop.move_mouse(x, y)
177
  self.desktop.double_click()
178
  self.logger.log(f"Double-clicked at coordinates ({x}, {y})")
179
- self.click_coordinates = [x, y]
180
  return f"Double-clicked at coordinates ({x}, {y})"
181
 
182
  @tool
@@ -304,17 +315,6 @@ class E2BVisionAgent(CodeAgent):
304
  screenshot_bytes = self.desktop.screenshot()
305
  image = Image.open(BytesIO(screenshot_bytes))
306
 
307
- if getattr(self, "click_coordinates", None):
308
- # If a click was performed in the last action, mark it on the image
309
- x, y = self.click_coordinates
310
- draw = ImageDraw.Draw(image)
311
- cross_size, linewidth = 10, 3
312
- # Draw red cross lines
313
- draw.line((x - cross_size, y, x + cross_size, y), fill="red", width=linewidth)
314
- draw.line((x, y - cross_size, x, y + cross_size), fill="red", width=linewidth)
315
- # Add a circle around it for better visibility
316
- draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="red", width=linewidth)
317
-
318
  # Create a filename with step number
319
  screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
320
  image.save(screenshot_path)
@@ -329,14 +329,22 @@ class E2BVisionAgent(CodeAgent):
329
  and previous_memory_step.step_number <= current_step - 2
330
  ):
331
  previous_memory_step.observations_images = None
 
 
 
 
 
 
 
 
 
 
332
 
333
  # Add to the current memory step
334
  memory_step.observations_images = [image.copy()]
335
 
336
  # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
337
 
338
- self.click_coordinates = None
339
-
340
 
341
  def close(self):
342
  """Clean up resources"""
@@ -358,8 +366,7 @@ class QwenVLAPIModel(Model):
358
  super().__init__()
359
  self.model_id = model_id
360
  self.base_model = HfApiModel(
361
- model_id,
362
- provider="hyperbolic",
363
  token=hf_token,
364
  )
365
  self.fallback_model = HfApiModel(
 
89
  REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
90
  """
91
 
92
+ def draw_marker_on_image(image, click_coordinates):
93
+ x, y = click_coordinates
94
+ draw = ImageDraw.Draw(image)
95
+ cross_size, linewidth = 10, 3
96
+ # Draw red cross lines
97
+ draw.line((x - cross_size, y, x + cross_size, y), fill="red", width=linewidth)
98
+ draw.line((x, y - cross_size, x, y + cross_size), fill="red", width=linewidth)
99
+ # Add a circle around it for better visibility
100
+ draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="red", width=linewidth)
101
+
102
  class E2BVisionAgent(CodeAgent):
103
  """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
104
  def __init__(
 
158
  self.desktop.move_mouse(x, y)
159
  self.desktop.left_click()
160
  self.logger.log(f"Clicked at coordinates ({x}, {y})")
161
+ self.memory.steps[-1].click_coordinates = [x, y]
162
+ print("FLAGG", self.memory.steps[-1])
163
  return f"Clicked at coordinates ({x}, {y})"
164
 
165
  @tool
 
173
  self.desktop.move_mouse(x, y)
174
  self.desktop.right_click()
175
  self.logger.log(f"Right-clicked at coordinates ({x}, {y})")
176
+ self.memory.steps[-1].click_coordinates = [x, y]
177
  return f"Right-clicked at coordinates ({x}, {y})"
178
 
179
  @tool
 
187
  self.desktop.move_mouse(x, y)
188
  self.desktop.double_click()
189
  self.logger.log(f"Double-clicked at coordinates ({x}, {y})")
190
+ self.memory.steps[-1].click_coordinates = [x, y]
191
  return f"Double-clicked at coordinates ({x}, {y})"
192
 
193
  @tool
 
315
  screenshot_bytes = self.desktop.screenshot()
316
  image = Image.open(BytesIO(screenshot_bytes))
317
 
 
 
 
 
 
 
 
 
 
 
 
318
  # Create a filename with step number
319
  screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
320
  image.save(screenshot_path)
 
329
  and previous_memory_step.step_number <= current_step - 2
330
  ):
331
  previous_memory_step.observations_images = None
332
+ if (
333
+ isinstance(previous_memory_step, ActionStep)
334
+ and previous_memory_step.step_number == current_step - 1
335
+ and hasattr(memory_step, "click_coordinates")
336
+ ):
337
+ print("Drawing cross on previous step image")
338
+ draw_marker_on_image(previous_memory_step.observations_images[0], memory_step.click_coordinates)
339
+
340
+ if hasattr(memory_step, "click_coordinates"):
341
+ draw_marker_on_image(image, memory_step.click_coordinates)
342
 
343
  # Add to the current memory step
344
  memory_step.observations_images = [image.copy()]
345
 
346
  # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
347
 
 
 
348
 
349
  def close(self):
350
  """Clean up resources"""
 
366
  super().__init__()
367
  self.model_id = model_id
368
  self.base_model = HfApiModel(
369
+ model_id="https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud",
 
370
  token=hf_token,
371
  )
372
  self.fallback_model = HfApiModel(