m-ric HF Staff committed on
Commit
674cae2
·
1 Parent(s): 4e27b40

Adjust resolution

Browse files
Files changed (2) hide show
  1. app.py +4 -4
  2. e2bqwen.py +17 -21
app.py CHANGED
@@ -20,8 +20,8 @@ E2B_API_KEY = os.getenv("E2B_API_KEY")
20
  SANDBOXES = {}
21
  SANDBOX_METADATA = {}
22
  SANDBOX_TIMEOUT = 600
23
- WIDTH = 1280
24
- HEIGHT = 960
25
  TMP_DIR = './tmp/'
26
  if not os.path.exists(TMP_DIR):
27
  os.makedirs(TMP_DIR)
@@ -528,7 +528,7 @@ class EnrichedGradioUI(GradioUI):
528
  if hasattr(session_state["agent"], "last_screenshot") and msg.content == "-----": # Append the last screenshot before the end of step
529
  stored_messages.append(gr.ChatMessage(
530
  role="assistant",
531
- content={"path": session_state["agent"].last_screenshot.to_string(), "mime_type": "image/png"},
532
  ))
533
  stored_messages.append(msg)
534
  yield stored_messages
@@ -619,7 +619,7 @@ with gr.Blocks(theme=theme, css=custom_css, js=custom_js) as demo:
619
  left: 110px;
620
  }
621
  .sandbox-iframe {
622
- transform: scale(0.535);
623
  /* transform: scale(0.59); */
624
  }
625
 
 
20
  SANDBOXES = {}
21
  SANDBOX_METADATA = {}
22
  SANDBOX_TIMEOUT = 600
23
+ WIDTH = 1024
24
+ HEIGHT = 768
25
  TMP_DIR = './tmp/'
26
  if not os.path.exists(TMP_DIR):
27
  os.makedirs(TMP_DIR)
 
528
  if hasattr(session_state["agent"], "last_screenshot") and msg.content == "-----": # Append the last screenshot before the end of step
529
  stored_messages.append(gr.ChatMessage(
530
  role="assistant",
531
+ content={"path": session_state["agent"].last_marked_screenshot.to_string(), "mime_type": "image/png"},
532
  ))
533
  stored_messages.append(msg)
534
  yield stored_messages
 
619
  left: 110px;
620
  }
621
  .sandbox-iframe {
622
+ transform: scale(0.667);
623
  /* transform: scale(0.59); */
624
  }
625
 
e2bqwen.py CHANGED
@@ -138,27 +138,27 @@ Whenever you click, MAKE SURE to click in the middle of the button, text, link o
138
  </click_guidelines>
139
 
140
  <general_guidelines>
 
141
  You can wait for appropriate loading times using the wait() tool. But don't wait forever, sometimes you've just misclicked and the process didn't launch.
142
- Use precise coordinates based on the current screenshot. Don't do hypothesis or guessing: USE TRUE COORDINATES.
143
  Execute one action at a time: don't try to pack a click and typing in one action.
144
  On each step, look at the last screenshot and action to validate if previous steps worked and decide the next action. If you repeated an action already without effect, it means that this action is useless: don't repeat it and try something else.
145
  Use click to move through menus on the desktop and scroll for web and specific applications.
146
  Always analyze the latest screenshot carefully before performing actions. Make sure to:
147
  To navigate the desktop you should open menus and click. Menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
148
- Always analyze the latest screenshot carefully before performing actions.
149
- The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels.
150
  </general_guidelines>
151
  """
152
 
153
- def draw_marker_on_image(image, click_coordinates):
  # Pre-change version: marks a click position by drawing a red cross with a
  # red circle around it, centred on click_coordinates. It draws directly on
  # the image it is given and has no return statement.
154
  x, y = click_coordinates
155
- draw = ImageDraw.Draw(image)
156
  # cross_size is the half-length of each cross arm; linewidth is the stroke width.
  cross_size, linewidth = 10, 3
157
  # Draw red cross lines
158
  draw.line((x - cross_size, y, x + cross_size, y), fill="red", width=linewidth)
159
  draw.line((x, y - cross_size, x, y + cross_size), fill="red", width=linewidth)
160
  # Add a circle around it for better visibility
161
  # The bounding box extends 2 * cross_size from (x, y) in every direction.
  draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="red", width=linewidth)
 
162
 
163
  class E2BVisionAgent(CodeAgent):
164
  """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
@@ -194,7 +194,7 @@ class E2BVisionAgent(CodeAgent):
194
  **kwargs
195
  )
196
  self.prompt_templates["system_prompt"] = E2B_SYSTEM_PROMPT_TEMPLATE.replace("<<resolution_x>>", str(self.width)).replace("<<resolution_y>>", str(self.height))
197
-
198
 
199
  # Add screen info to state
200
  self.state["screen_width"] = self.width
@@ -396,25 +396,21 @@ class E2BVisionAgent(CodeAgent):
396
 
397
  current_step = memory_step.step_number
398
 
399
- time.sleep(2.0) # Let things happen on the desktop
400
  screenshot_bytes = self.desktop.screenshot(format="bytes")
401
  image = Image.open(BytesIO(screenshot_bytes))
402
 
403
- if getattr(self, "click_coordinates", None):
404
- # If a click was performed in the last action, mark it on the image
405
- x, y = self.click_coordinates
406
- draw = ImageDraw.Draw(image)
407
- cross_size, linewidth = 10, 3
408
- # Draw red cross lines
409
- draw.line((x - cross_size, y, x + cross_size, y), fill="red", width=linewidth)
410
- draw.line((x, y - cross_size, x, y + cross_size), fill="red", width=linewidth)
411
- # Add a circle around it for better visibility
412
- draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="red", width=linewidth)
413
-
414
  # Create a filename with step number
415
  screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
416
  image.save(screenshot_path)
417
- self.last_screenshot = AgentImage(screenshot_path)
 
 
 
 
 
 
 
418
  print(f"Saved screenshot for step {current_step} to {screenshot_path}")
419
 
420
  for (
@@ -433,8 +429,8 @@ class E2BVisionAgent(CodeAgent):
433
  if previous_memory_step.tool_calls[0].arguments == memory_step.tool_calls[0].arguments:
434
  memory_step.observations += "\nWARNING: You've executed the same action several times in a row. MAKE SURE TO NOT UNNECESSARILY REPEAT ACTIONS."
435
 
436
- # Add to the current memory step
437
- memory_step.observations_images = [image.copy()]
438
 
439
  # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
440
 
 
138
  </click_guidelines>
139
 
140
  <general_guidelines>
141
+ Always analyze the latest screenshot carefully before performing actions.
142
  You can wait for appropriate loading times using the wait() tool. But don't wait forever, sometimes you've just misclicked and the process didn't launch.
143
+ Use precise coordinates based on the current screenshot. The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels: NEVER USE HYPOTHETIC COORDINATES, USE TRUE COORDINATES that you can see from the screenshot.
144
  Execute one action at a time: don't try to pack a click and typing in one action.
145
  On each step, look at the last screenshot and action to validate if previous steps worked and decide the next action. If you repeated an action already without effect, it means that this action is useless: don't repeat it and try something else.
146
  Use click to move through menus on the desktop and scroll for web and specific applications.
147
  Always analyze the latest screenshot carefully before performing actions. Make sure to:
148
  To navigate the desktop you should open menus and click. Menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
 
 
149
  </general_guidelines>
150
  """
151
 
152
+ def draw_marker_on_image(image_copy, click_coordinates):
  # Marks a click position on a screenshot: draws a red cross with a red circle
  # around it, centred on click_coordinates, then returns the image it was given.
  # No copy is made here; the drawing goes onto image_copy itself, so the caller
  # should pass an image it is willing to have modified.
153
  x, y = click_coordinates
154
+ draw = ImageDraw.Draw(image_copy)
155
  # cross_size is the half-length of each cross arm; linewidth is the stroke width.
  cross_size, linewidth = 10, 3
156
  # Draw red cross lines
157
  draw.line((x - cross_size, y, x + cross_size, y), fill="red", width=linewidth)
158
  draw.line((x, y - cross_size, x, y + cross_size), fill="red", width=linewidth)
159
  # Add a circle around it for better visibility
160
  # The bounding box extends 2 * cross_size from (x, y) in every direction.
  draw.ellipse((x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2), outline="red", width=linewidth)
161
  # Return the same object that was passed in so the call can be used in an assignment.
+ return image_copy
162
 
163
  class E2BVisionAgent(CodeAgent):
164
  """Agent for e2b desktop automation with Qwen2.5VL vision capabilities"""
 
194
  **kwargs
195
  )
196
  self.prompt_templates["system_prompt"] = E2B_SYSTEM_PROMPT_TEMPLATE.replace("<<resolution_x>>", str(self.width)).replace("<<resolution_y>>", str(self.height))
197
+ print("PROMPT TEMPLATE:", self.prompt_templates["system_prompt"])
198
 
199
  # Add screen info to state
200
  self.state["screen_width"] = self.width
 
396
 
397
  current_step = memory_step.step_number
398
 
399
+ time.sleep(3.0) # Let things happen on the desktop
400
  screenshot_bytes = self.desktop.screenshot(format="bytes")
401
  image = Image.open(BytesIO(screenshot_bytes))
402
 
 
 
 
 
 
 
 
 
 
 
 
403
  # Create a filename with step number
404
  screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
405
  image.save(screenshot_path)
406
+
407
+ image_copy = image.copy()
408
+
409
+ if getattr(self, "click_coordinates", None):
410
+ print("DRAWING MARKER")
411
+ image_copy = draw_marker_on_image(image_copy, self.click_coordinates)
412
+
413
+ self.last_marked_screenshot = AgentImage(screenshot_path)
414
  print(f"Saved screenshot for step {current_step} to {screenshot_path}")
415
 
416
  for (
 
429
  if previous_memory_step.tool_calls[0].arguments == memory_step.tool_calls[0].arguments:
430
  memory_step.observations += "\nWARNING: You've executed the same action several times in a row. MAKE SURE TO NOT UNNECESSARILY REPEAT ACTIONS."
431
 
432
+ # Add the marker-edited image to the current memory step
433
+ memory_step.observations_images = [image_copy]
434
 
435
  # memory_step.observations_images = [screenshot_path] # IF YOU USE THIS INSTEAD OF ABOVE, LAUNCHING A SECOND TASK BREAKS
436