m-ric HF Staff committed on
Commit
5cc8c9a
·
1 Parent(s): e781603

AB testing with older version

Browse files
Files changed (3) hide show
  1. app.py +7 -2
  2. e2bqwen.py +268 -37
  3. eval.py +18 -10
app.py CHANGED
@@ -10,7 +10,7 @@ from threading import Timer
10
  from huggingface_hub import upload_folder, login
11
  from e2b_desktop import Sandbox
12
 
13
- from smolagents import CodeAgent
14
  from smolagents.monitoring import LogLevel
15
  from smolagents.gradio_ui import GradioUI, stream_to_gradio
16
  from model_replay import FakeModelReplayLog
@@ -471,6 +471,10 @@ def create_agent(data_dir, desktop):
471
  model_id="Qwen/Qwen2.5-VL-72B-Instruct",
472
  hf_token = hf_token,
473
  )
 
 
 
 
474
  return E2BVisionAgent(
475
  model=model,
476
  data_dir=data_dir,
@@ -478,6 +482,7 @@ def create_agent(data_dir, desktop):
478
  max_steps=200,
479
  verbosity_level=2,
480
  planning_interval=10,
 
481
  )
482
 
483
  def get_agent_summary_erase_images(agent):
@@ -581,7 +586,7 @@ _Please note that we store the task logs by default so **do not write any person
581
  "When was Temple Grandin introduced to the American Academy of Arts and Sciences, according to Wikipedia?",
582
  "Search a flight Rome - Berlin for tomorrow",
583
  "What' s the name of the pond just south of Château de Fontainebleau in Google maps?",
584
- "Go generate a picture of the Golden Gate bridge on a FLUX1.dev space",
585
  "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
586
  ],
587
  inputs = task_input,
 
10
  from huggingface_hub import upload_folder, login
11
  from e2b_desktop import Sandbox
12
 
13
+ from smolagents import CodeAgent, OpenAIServerModel
14
  from smolagents.monitoring import LogLevel
15
  from smolagents.gradio_ui import GradioUI, stream_to_gradio
16
  from model_replay import FakeModelReplayLog
 
471
  model_id="Qwen/Qwen2.5-VL-72B-Instruct",
472
  hf_token = hf_token,
473
  )
474
+
475
+ model = OpenAIServerModel(
476
+ "gpt-4o",api_key=os.getenv("OPENAI_API_KEY")
477
+ )
478
  return E2BVisionAgent(
479
  model=model,
480
  data_dir=data_dir,
 
482
  max_steps=200,
483
  verbosity_level=2,
484
  planning_interval=10,
485
+ use_v1_prompt=True
486
  )
487
 
488
  def get_agent_summary_erase_images(agent):
 
586
  "When was Temple Grandin introduced to the American Academy of Arts and Sciences, according to Wikipedia?",
587
  "Search a flight Rome - Berlin for tomorrow",
588
  "What' s the name of the pond just south of Château de Fontainebleau in Google maps?",
589
+ "Go on the Hugging Face Hub, find the space for FLUX1.dev, then generate a picture of the Golden Gate bridge",
590
  "Download me a picture of a puppy from Google, then head to Hugging Face, find a Space dedicated to background removal, and use it to remove the puppy picture's background",
591
  ],
592
  inputs = task_input,
e2bqwen.py CHANGED
@@ -180,6 +180,7 @@ class E2BVisionAgent(CodeAgent):
180
  max_steps: int = 200,
181
  verbosity_level: LogLevel = 2,
182
  planning_interval: int = 10,
 
183
  **kwargs
184
  ):
185
  self.desktop = desktop
@@ -193,6 +194,7 @@ class E2BVisionAgent(CodeAgent):
193
  os.makedirs(self.data_dir, exist_ok=True)
194
  print(f"Screenshots and steps will be saved to: {self.data_dir}")
195
 
 
196
  # Initialize base agent
197
  super().__init__(
198
  tools=tools or [],
@@ -208,28 +210,90 @@ class E2BVisionAgent(CodeAgent):
208
  self.state["screen_width"] = self.width
209
  self.state["screen_height"] = self.height
210
 
211
-
212
  # Add default tools
213
  self.logger.log("Setting up agent tools...")
214
  self._setup_desktop_tools()
215
  self.step_callbacks.append(self.take_screenshot_callback)
216
 
217
  def initialize_system_prompt(self) -> str:
218
- system_prompt = populate_template(
219
- self.prompt_templates["system_prompt"],
220
- variables={
221
- "tools": self.tools,
222
- "managed_agents": self.managed_agents,
223
- "authorized_imports": (
224
- "You can import from any package you want."
225
- if "*" in self.authorized_imports
226
- else str(self.authorized_imports)
227
- ),
228
- },
229
- )
230
- assert system_prompt != self.prompt_templates["system_prompt"], "Populating prompt template failed"
231
- print("FINAL PROMPT:", system_prompt)
232
- return system_prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
  def _setup_desktop_tools(self):
235
  """Register all desktop tools"""
@@ -471,44 +535,211 @@ class E2BVisionAgent(CodeAgent):
471
  print("E2B sandbox terminated")
472
 
473
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  class QwenVLAPIModel(Model):
475
  """Model wrapper for Qwen2.5VL API with fallback mechanism"""
476
 
477
  def __init__(
478
  self,
479
- model_id: str = "Qwen/Qwen2.5-VL-72B-Instruct",
 
480
  hf_token: str = None,
 
 
 
 
481
  ):
482
  super().__init__()
483
- self.model_id = model_id
484
- self.base_model = HfApiModel(
485
- model_id="https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud",
486
- token=hf_token,
487
- max_tokens=4096,
488
- )
489
- self.fallback_model = HfApiModel(
490
- model_id,
491
- provider="nebius",
492
- token=hf_token,
493
- max_tokens=4096,
494
  )
495
 
 
 
 
 
 
 
 
 
 
496
  def __call__(
497
  self,
498
  messages: List[Dict[str, Any]],
499
  stop_sequences: Optional[List[str]] = None,
500
  **kwargs
501
  ) -> ChatMessage:
 
 
 
 
502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
503
  try:
504
- message = self.base_model(messages, stop_sequences, **kwargs)
505
- return message
506
- except Exception as e:
507
- print(f"Base model failed with error: {e}. Calling fallback model.")
508
-
509
- # Continue to fallback
510
- try:
511
- message = self.fallback_model(messages, stop_sequences, **kwargs)
512
- return message
513
  except Exception as e:
514
  raise Exception(f"Both endpoints failed. Last error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  max_steps: int = 200,
181
  verbosity_level: LogLevel = 2,
182
  planning_interval: int = 10,
183
+ use_v1_prompt: bool = False,
184
  **kwargs
185
  ):
186
  self.desktop = desktop
 
194
  os.makedirs(self.data_dir, exist_ok=True)
195
  print(f"Screenshots and steps will be saved to: {self.data_dir}")
196
 
197
+ self.use_v1_prompt = use_v1_prompt
198
  # Initialize base agent
199
  super().__init__(
200
  tools=tools or [],
 
210
  self.state["screen_width"] = self.width
211
  self.state["screen_height"] = self.height
212
 
 
213
  # Add default tools
214
  self.logger.log("Setting up agent tools...")
215
  self._setup_desktop_tools()
216
  self.step_callbacks.append(self.take_screenshot_callback)
217
 
218
  def initialize_system_prompt(self) -> str:
219
+ if self.use_v1_prompt:
220
+ return """You are a desktop automation assistant that can control a remote desktop environment.
221
+ You only have access to the following tools to interact with the desktop, no additional ones:
222
+ - click(x, y): Performs a left-click at the specified coordinates
223
+ - right_click(x, y): Performs a right-click at the specified coordinates
224
+ - double_click(x, y): Performs a double-click at the specified coordinates
225
+ - move_mouse(x, y): Moves the mouse cursor to the specified coordinates
226
+ - type_text(text): Types the specified text at the current cursor position
227
+ - press_key(key): Presses a keyboard key (e.g., "Return", "tab", "ctrl+c")
228
+ - scroll(x, y, direction, amount): Scrolls a website in a browser or a document (direction can be "up" or "down", a common amount is 1 or 2 scroll("down",1) ). DO NOT use scroll to move through linux desktop menus. x, y, is the mouse position to scroll on.
229
+ - wait(seconds): Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
230
+ - open_url(url): Directly opens a browser with the specified url, saves time compared to clicking in a browser and going through the initial setup wizard.
231
+ - final_answer("YOUR FINAL ANSWER TEXT"): Announces that the task requested is completed and provides a final text
232
+ The desktop has a resolution of {resolution_x}x{resolution_y}.
233
+ IMPORTANT:
234
+ - Remember the tools that you have as those can save you time, for example open_url to enter a website rather than searching for the browser in the OS.
235
+ - Whenever you click, MAKE SURE to click in the middle of the button, text, link or any other clickable element. Not under, not on the side. IN THE MIDDLE. In menus it is always better to click in the middle of the text rather than in the tiny icon. Calculate extremelly well the coordinates. A mistake here can make the full task fail.
236
+ - To navigate the desktop you should open menus and click. Menus usually expand with more options, the tiny triangle next to some text in a menu means that menu expands. For example in Office in the Applications menu expands showing presentation or writing applications.
237
+ - Always analyze the latest screenshot carefully before performing actions. If you clicked somewhere in the previous action and in the screenshot nothing happened, make sure the mouse is where it should be. Otherwise you can see that the coordinates were wrong.
238
+ You must proceed step by step:
239
+ 1. Understand the task thoroughly
240
+ 2. Break down the task into logical steps
241
+ 3. For each step:
242
+ a. Analyze the current screenshot to identify UI elements
243
+ b. Plan the appropriate action with precise coordinates
244
+ c. Execute ONE action at a time using the proper tool
245
+ d. Wait for the action to complete before proceeding
246
+ After each action, you'll receive an updated screenshot. Review it carefully before your next action.
247
+ COMMAND FORMAT:
248
+ Always format your actions as Python code blocks. For example:
249
+ ```python
250
+ click(250, 300)
251
+ ```<end_code>
252
+ TASK EXAMPLE:
253
+ For a task like "Open a text editor and type 'Hello World'":
254
+ 1- First, analyze the screenshot to find the Applications menu and click on it being very precise, clicking in the middle of the text 'Applications':
255
+ ```python
256
+ click(50, 10)
257
+ ```<end_code>
258
+ 2- Remembering that menus are navigated through clicking, after analyzing the screenshot with the applications menu open we see that a notes application probably fits in the Accessories section (we see it is a section in the menu thanks to the tiny white triangle after the text accessories). We look for Accessories and click on it being very precise, clicking in the middle of the text 'Accessories'. DO NOT try to move through the menus with scroll, it won't work:
259
+ ```python
260
+ click(76, 195)
261
+ ```<end_code>
262
+ 3- Remembering that menus are navigated through clicking, after analyzing the screenshot with the submenu Accessories open, look for 'Text Editor' and click on it being very precise, clicking in the middle of the text 'Text Editor':
263
+ ```python
264
+ click(241, 441)
265
+ ```<end_code>
266
+ 4- Once Notepad is open, type the requested text:
267
+ ```python
268
+ type_text("Hello World")
269
+ ```<end_code>
270
+ 5- Task is completed:
271
+ ```python
272
+ final_answer("Done")
273
+ ```<end_code>
274
+ Remember to:
275
+ Always wait for appropriate loading times
276
+ Use precise coordinates based on the current screenshot
277
+ Execute one action at a time
278
+ Verify the result before proceeding to the next step
279
+ Use click to move through menus on the desktop and scroll for web and specific applications.
280
+ REMEMBER TO ALWAYS CLICK IN THE MIDDLE OF THE TEXT, NOT ON THE SIDE, NOT UNDER.
281
+ """.format(resolution_x=self.width, resolution_y=self.height)
282
+ else:
283
+ system_prompt = populate_template(
284
+ self.prompt_templates["system_prompt"],
285
+ variables={
286
+ "tools": self.tools,
287
+ "managed_agents": self.managed_agents,
288
+ "authorized_imports": (
289
+ "You can import from any package you want."
290
+ if "*" in self.authorized_imports
291
+ else str(self.authorized_imports)
292
+ ),
293
+ },
294
+ )
295
+ assert system_prompt != self.prompt_templates["system_prompt"], "Populating prompt template failed"
296
+ return system_prompt
297
 
298
  def _setup_desktop_tools(self):
299
  """Register all desktop tools"""
 
535
  print("E2B sandbox terminated")
536
 
537
 
538
+ # class QwenVLAPIModel(Model):
539
+ # """Model wrapper for Qwen2.5VL API with fallback mechanism"""
540
+
541
+ # def __init__(
542
+ # self,
543
+ # model_id: str = "Qwen/Qwen2.5-VL-72B-Instruct",
544
+ # hf_token: str = None,
545
+ # ):
546
+ # super().__init__()
547
+ # self.model_id = model_id
548
+ # self.base_model = HfApiModel(
549
+ # model_id="https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud",
550
+ # token=hf_token,
551
+ # max_tokens=4096,
552
+ # )
553
+ # self.fallback_model = HfApiModel(
554
+ # model_id,
555
+ # provider="nebius",
556
+ # token=hf_token,
557
+ # max_tokens=4096,
558
+ # )
559
+
560
+ # def __call__(
561
+ # self,
562
+ # messages: List[Dict[str, Any]],
563
+ # stop_sequences: Optional[List[str]] = None,
564
+ # **kwargs
565
+ # ) -> ChatMessage:
566
+
567
+ # try:
568
+ # message = self.base_model(messages, stop_sequences, **kwargs)
569
+ # return message
570
+ # except Exception as e:
571
+ # print(f"Base model failed with error: {e}. Calling fallback model.")
572
+
573
+ # # Continue to fallback
574
+ # try:
575
+ # message = self.fallback_model(messages, stop_sequences, **kwargs)
576
+ # return message
577
+ # except Exception as e:
578
+ # raise Exception(f"Both endpoints failed. Last error: {e}")
579
+
580
  class QwenVLAPIModel(Model):
581
  """Model wrapper for Qwen2.5VL API with fallback mechanism"""
582
 
583
  def __init__(
584
  self,
585
+ model_path: str = "Qwen/Qwen2.5-VL-72B-Instruct",
586
+ provider: str = "hyperbolic",
587
  hf_token: str = None,
588
+ #hf_base_url: str = "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"
589
+ #hf_base_url: str = "https://s41ydkv0iyjeokyj.us-east-1.aws.endpoints.huggingface.cloud/v1/"
590
+ #hf_base_url: str = "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"
591
+ hf_base_url: str= "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"
592
  ):
593
  super().__init__()
594
+ self.model_path = model_path
595
+ self.model_id = model_path
596
+ self.provider = provider
597
+ self.hf_token = hf_token
598
+ self.hf_base_url = hf_base_url
599
+
600
+ # Initialize hyperbolic client
601
+ self.hyperbolic_client = InferenceClient(
602
+ provider=self.provider,
 
 
603
  )
604
 
605
+ # Initialize HF OpenAI-compatible client if token is provided
606
+ self.hf_client = None
607
+ if hf_token:
608
+ from openai import OpenAI
609
+ self.hf_client = OpenAI(
610
+ base_url=self.hf_base_url,
611
+ api_key=self.hf_token
612
+ )
613
+
614
  def __call__(
615
  self,
616
  messages: List[Dict[str, Any]],
617
  stop_sequences: Optional[List[str]] = None,
618
  **kwargs
619
  ) -> ChatMessage:
620
+ """Convert a list of messages to an API request with fallback mechanism"""
621
+ print(messages)
622
+ # Format messages once for both APIs
623
+ formatted_messages = self._format_messages(messages)
624
 
625
+ # First try the HF endpoint if available
626
+ if self.hf_client:
627
+ try:
628
+ completion = self._call_hf_endpoint(
629
+ formatted_messages,
630
+ stop_sequences,
631
+ **kwargs
632
+ )
633
+ return ChatMessage(role=MessageRole.ASSISTANT, content=completion)
634
+ except Exception as e:
635
+ print(f"HF endpoint failed with error: {e}. Falling back to hyperbolic.")
636
+ # Continue to fallback
637
+
638
+ # Fallback to hyperbolic
639
  try:
640
+ return self._call_hyperbolic(formatted_messages, stop_sequences, **kwargs)
 
 
 
 
 
 
 
 
641
  except Exception as e:
642
  raise Exception(f"Both endpoints failed. Last error: {e}")
643
+
644
+ def _format_messages(self, messages: List[Dict[str, Any]]):
645
+ """Format messages for API requests - works for both endpoints"""
646
+
647
+ formatted_messages = []
648
+
649
+ for msg in messages:
650
+ role = msg["role"]
651
+ content = []
652
+
653
+ if isinstance(msg["content"], list):
654
+ for item in msg["content"]:
655
+ if item["type"] == "text":
656
+ content.append({"type": "text", "text": item["text"]})
657
+ elif item["type"] == "image":
658
+ # Handle image path or direct image object
659
+ if isinstance(item["image"], str):
660
+ # Image is a path
661
+ with open(item["image"], "rb") as image_file:
662
+ base64_image = base64.b64encode(image_file.read()).decode("utf-8")
663
+ else:
664
+ # Image is a PIL image or similar object
665
+ img_byte_arr = io.BytesIO()
666
+ item["image"].save(img_byte_arr, format="PNG")
667
+ base64_image = base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
668
+
669
+ content.append({
670
+ "type": "image_url",
671
+ "image_url": {
672
+ "url": f"data:image/png;base64,{base64_image}"
673
+ }
674
+ })
675
+ else:
676
+ # Plain text message
677
+ content = [{"type": "text", "text": msg["content"]}]
678
+
679
+ formatted_messages.append({"role": role, "content": content})
680
+
681
+ return formatted_messages
682
+
683
+ def _call_hf_endpoint(self, formatted_messages, stop_sequences=None, **kwargs):
684
+ """Call the Hugging Face OpenAI-compatible endpoint"""
685
+
686
+ # Extract parameters with defaults
687
+ max_tokens = kwargs.get("max_new_tokens", 512)
688
+ temperature = kwargs.get("temperature", 0.7)
689
+ top_p = kwargs.get("top_p", 0.9)
690
+ stream = kwargs.get("stream", False)
691
+
692
+ completion = self.hf_client.chat.completions.create(
693
+ model="tgi", # Model name for the endpoint
694
+ messages=formatted_messages,
695
+ max_tokens=max_tokens,
696
+ temperature=temperature,
697
+ top_p=top_p,
698
+ stream=stream,
699
+ stop=stop_sequences
700
+ )
701
+
702
+ if stream:
703
+ # For streaming responses, return a generator
704
+ def stream_generator():
705
+ for chunk in completion:
706
+ yield chunk.choices[0].delta.content or ""
707
+ return stream_generator()
708
+ else:
709
+ # For non-streaming, return the full text
710
+ return completion.choices[0].message.content
711
+
712
+ def _call_hyperbolic(self, formatted_messages, stop_sequences=None, **kwargs):
713
+ """Call the hyperbolic API"""
714
+
715
+ completion = self.hyperbolic_client.chat.completions.create(
716
+ model=self.model_path,
717
+ messages=formatted_messages,
718
+ max_tokens=kwargs.get("max_new_tokens", 512),
719
+ temperature=kwargs.get("temperature", 0.7),
720
+ top_p=kwargs.get("top_p", 0.9),
721
+ )
722
+
723
+ # Extract the response text
724
+ output_text = completion.choices[0].message.content
725
+
726
+ return ChatMessage(role=MessageRole.ASSISTANT, content=output_text)
727
+
728
+ def to_dict(self) -> Dict[str, Any]:
729
+ """Convert the model to a dictionary"""
730
+ return {
731
+ "class": self.__class__.__name__,
732
+ "model_path": self.model_path,
733
+ "provider": self.provider,
734
+ "hf_base_url": self.hf_base_url,
735
+ # We don't save the API keys for security reasons
736
+ }
737
+
738
+ @classmethod
739
+ def from_dict(cls, data: Dict[str, Any]) -> "QwenVLAPIModel":
740
+ """Create a model from a dictionary"""
741
+ return cls(
742
+ model_path=data.get("model_path", "Qwen/Qwen2.5-VL-72B-Instruct"),
743
+ provider=data.get("provider", "hyperbolic"),
744
+ hf_base_url=data.get("hf_base_url", "https://n5wr7lfx6wp94tvl.us-east-1.aws.endpoints.huggingface.cloud/v1/"),
745
+ )
eval.py CHANGED
@@ -12,10 +12,13 @@ from threading import Timer
12
  from e2b_desktop import Sandbox
13
  from huggingface_hub import get_token
14
 
15
- from smolagents import CodeAgent
16
  from smolagents.monitoring import LogLevel
17
  from e2bqwen import QwenVLAPIModel, E2BVisionAgent
18
 
 
 
 
19
  # Environment variables and constants
20
  E2B_API_KEY = os.getenv("E2B_API_KEY")
21
  # Try to get token dynamically, fall back to environment variable
@@ -53,17 +56,21 @@ def get_git_hash():
53
  except:
54
  return "nogit"
55
 
56
- def create_agent(data_dir, desktop):
57
  """Create an agent with the E2B desktop sandbox"""
58
  model = QwenVLAPIModel(
59
  model_id="Qwen/Qwen2.5-VL-72B-Instruct",
60
  hf_token=HUGGINGFACE_API_KEY,
61
  )
 
 
 
 
62
  return E2BVisionAgent(
63
  model=model,
64
  data_dir=data_dir,
65
  desktop=desktop,
66
- max_steps=200,
67
  verbosity_level=2,
68
  planning_interval=10,
69
  )
@@ -109,7 +116,7 @@ def save_final_status(folder, status: str, summary, error_message=None) -> None:
109
  "error_message": error_message
110
  }, default=chat_message_to_json))
111
 
112
- def run_example_once(example_name, example_text, run_index, example_dir):
113
  """Run a single example once and return the result"""
114
  run_dir = os.path.join(example_dir, f"run_{run_index}")
115
  os.makedirs(run_dir, exist_ok=True)
@@ -135,7 +142,7 @@ def run_example_once(example_name, example_text, run_index, example_dir):
135
  desktop.commands.run(setup_cmd)
136
 
137
  # Create and run the agent
138
- agent = create_agent(data_dir=run_dir, desktop=desktop)
139
  try:
140
  agent.run(task=example_text)
141
  summary = get_agent_summary_erase_images(agent)
@@ -163,7 +170,7 @@ def run_example_once(example_name, example_text, run_index, example_dir):
163
 
164
  return result
165
 
166
- def run_example(example_name, example_text, num_runs, example_dir):
167
  """Run a single example multiple times using threads for each run"""
168
  thread_safe_print(f"\nRunning example '{example_name}': '{example_text[:50]}...'")
169
 
@@ -171,7 +178,7 @@ def run_example(example_name, example_text, num_runs, example_dir):
171
  with concurrent.futures.ThreadPoolExecutor(max_workers=num_runs) as executor:
172
  # Submit all runs to the executor
173
  future_to_run = {
174
- executor.submit(run_example_once, example_name, example_text, j, example_dir): j
175
  for j in range(num_runs)
176
  }
177
 
@@ -191,7 +198,7 @@ def run_example(example_name, example_text, num_runs, example_dir):
191
 
192
  return results
193
 
194
- def run_evaluation(examples, num_runs, output_dir, max_parallel):
195
  """Run each example n times and save the results"""
196
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
197
  git_hash = get_git_hash()
@@ -218,7 +225,7 @@ def run_evaluation(examples, num_runs, output_dir, max_parallel):
218
 
219
  # Submit all examples to the executor
220
  future_to_example = {
221
- executor.submit(run_example, example_name, example_text, num_runs, example_dirs[example_name]): example_name
222
  for example_name, example_text in examples.items()
223
  }
224
 
@@ -272,6 +279,7 @@ def main():
272
  parser.add_argument("--num-runs", type=int, default=3, help="Number of runs per example")
273
  parser.add_argument("--output-dir", type=str, default="./eval_results", help="Output directory for evaluation results")
274
  parser.add_argument("--max-parallel", type=int, default=2, help="Maximum number of examples to run in parallel")
 
275
  args = parser.parse_args()
276
 
277
  # Examples from the original code
@@ -290,7 +298,7 @@ def main():
290
  os.makedirs(args.output_dir, exist_ok=True)
291
 
292
  # Run the evaluation
293
- eval_dir = run_evaluation(examples, args.num_runs, args.output_dir, args.max_parallel)
294
 
295
  if __name__ == "__main__":
296
  main()
 
12
  from e2b_desktop import Sandbox
13
  from huggingface_hub import get_token
14
 
15
+ from smolagents import CodeAgent, OpenAIServerModel
16
  from smolagents.monitoring import LogLevel
17
  from e2bqwen import QwenVLAPIModel, E2BVisionAgent
18
 
19
+ from dotenv import load_dotenv
20
+
21
+ load_dotenv()
22
  # Environment variables and constants
23
  E2B_API_KEY = os.getenv("E2B_API_KEY")
24
  # Try to get token dynamically, fall back to environment variable
 
56
  except:
57
  return "nogit"
58
 
59
def create_agent(data_dir, desktop, max_steps: int):
    """Create an agent with the E2B desktop sandbox.

    Args:
        data_dir: Directory passed to the agent as its ``data_dir``.
        desktop: The E2B desktop sandbox the agent controls.
        max_steps: Maximum number of steps the agent may take in one run.

    Returns:
        A configured ``E2BVisionAgent``.
    """
    # QwenVLAPIModel's constructor keyword is `model_path`; passing the old
    # `model_id=` keyword raises TypeError with the current signature.
    model = QwenVLAPIModel(
        model_path="Qwen/Qwen2.5-VL-72B-Instruct",
        hf_token=HUGGINGFACE_API_KEY,
    )
    # Alternative model kept for A/B testing:
    # model = OpenAIServerModel(
    #     model_id="gpt-4o",
    #     api_key=os.getenv("OPENAI_API_KEY")
    # )
    return E2BVisionAgent(
        model=model,
        data_dir=data_dir,
        desktop=desktop,
        max_steps=max_steps,
        verbosity_level=2,
        planning_interval=10,
    )
 
116
  "error_message": error_message
117
  }, default=chat_message_to_json))
118
 
119
+ def run_example_once(example_name, example_text, run_index, example_dir, max_steps):
120
  """Run a single example once and return the result"""
121
  run_dir = os.path.join(example_dir, f"run_{run_index}")
122
  os.makedirs(run_dir, exist_ok=True)
 
142
  desktop.commands.run(setup_cmd)
143
 
144
  # Create and run the agent
145
+ agent = create_agent(data_dir=run_dir, desktop=desktop, max_steps=max_steps)
146
  try:
147
  agent.run(task=example_text)
148
  summary = get_agent_summary_erase_images(agent)
 
170
 
171
  return result
172
 
173
+ def run_example(example_name, example_text, num_runs, example_dir, max_steps):
174
  """Run a single example multiple times using threads for each run"""
175
  thread_safe_print(f"\nRunning example '{example_name}': '{example_text[:50]}...'")
176
 
 
178
  with concurrent.futures.ThreadPoolExecutor(max_workers=num_runs) as executor:
179
  # Submit all runs to the executor
180
  future_to_run = {
181
+ executor.submit(run_example_once, example_name, example_text, j, example_dir, max_steps): j
182
  for j in range(num_runs)
183
  }
184
 
 
198
 
199
  return results
200
 
201
+ def run_evaluation(examples, num_runs, output_dir, max_parallel, max_steps):
202
  """Run each example n times and save the results"""
203
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
204
  git_hash = get_git_hash()
 
225
 
226
  # Submit all examples to the executor
227
  future_to_example = {
228
+ executor.submit(run_example, example_name, example_text, num_runs, example_dirs[example_name], max_steps): example_name
229
  for example_name, example_text in examples.items()
230
  }
231
 
 
279
  parser.add_argument("--num-runs", type=int, default=3, help="Number of runs per example")
280
  parser.add_argument("--output-dir", type=str, default="./eval_results", help="Output directory for evaluation results")
281
  parser.add_argument("--max-parallel", type=int, default=2, help="Maximum number of examples to run in parallel")
282
+ parser.add_argument("--max-steps", type=int, default=200, help="Maximum number of steps in each run")
283
  args = parser.parse_args()
284
 
285
  # Examples from the original code
 
298
  os.makedirs(args.output_dir, exist_ok=True)
299
 
300
  # Run the evaluation
301
+ eval_dir = run_evaluation(examples, args.num_runs, args.output_dir, args.max_parallel, args.max_steps)
302
 
303
  if __name__ == "__main__":
304
  main()