A-Mahla committed
Commit 8f4ea43 · unverified · 1 parent: dcaec95

Amir/handle sandbox (#18)

* ADD sandbox management

* ADD better sandbox management

* ADD better sandbox management

* ADD better sandbox management

* ADD better sandbox management

* ADD better sandbox management

README.md CHANGED
@@ -93,8 +93,34 @@ cp env.example .env
 Edit `.env` with your configuration:
 - API keys for OpenAI/LiteLLM
 - Database connections (if applicable)
+- HuggingFace credentials for data archival (optional)
 - Other service credentials
 
+#### Data Archival Configuration (Optional)
+
+CUA2 includes an automatic data archival feature that backs up old trace data to HuggingFace datasets:
+
+```bash
+# HuggingFace token for uploading archived data
+HF_TOKEN=your_huggingface_token_here
+
+# HuggingFace dataset repository ID (e.g., "username/dataset-name")
+HF_DATASET_REPO=your_username/your_dataset_repo
+
+# Check interval (default: 30 minutes)
+ARCHIVE_INTERVAL_MINUTES=30
+
+# Age threshold - folders older than this will be archived (default: 30 minutes)
+FOLDER_AGE_THRESHOLD_MINUTES=30
+```
+
+**How it works:**
+1. Every 30 minutes (configurable), the system checks the `data/` folder for trace folders
+2. Folders older than 30 minutes (configurable) are compressed into `.tar.gz` archives
+3. Archives are uploaded to your HuggingFace dataset repository
+4. After verifying successful upload, local folders are deleted to free up space
+5. This keeps your disk usage minimal while preserving all agent traces in the cloud
+
 ### 4. Start Development Servers
 
 #### Option 1: Using Makefile (Recommended)
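
Note: archived traces remain retrievable with the standard `huggingface_hub` client. A minimal sketch, assuming an archive produced by this feature (the repo id, token, and archive name below are placeholders; real archives follow the `trace-{uuid}-{model}.tar.gz` naming used by the archival service):

```python
import tarfile

from huggingface_hub import hf_hub_download

# Placeholder names; list the dataset repo to find real archive files.
archive = hf_hub_download(
    repo_id="your_username/your_dataset_repo",   # matches HF_DATASET_REPO
    filename="trace-<uuid>-<model>.tar.gz",      # hypothetical archive name
    repo_type="dataset",
    token="your_huggingface_token_here",
)

# Unpack the trace folder locally for inspection.
with tarfile.open(archive, "r:gz") as tar:
    tar.extractall("restored_traces")
```
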
cua2-core/pyproject.toml CHANGED
@@ -36,6 +36,7 @@ dependencies = [
     "smolagents[openai,litellm]==1.22.0",
     "openai==2.6.1",
     "e2b-desktop==2.1.0",
+    "huggingface_hub==1.1.2",
 ]
 
 [project.optional-dependencies]
cua2-core/pytest.ini CHANGED
@@ -11,3 +11,4 @@ addopts =
 markers =
     unit: Unit tests
     integration: Integration tests
+    slow: Slow tests that take more time to execute
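
With the `slow` marker registered, long-running tests can be deselected in quick runs via `pytest -m "not slow"`. A minimal sketch of a test opting in (the test body is illustrative, not from the repo):

```python
import time

import pytest


@pytest.mark.slow
def test_waits_for_archival_cycle():
    # Illustrative placeholder for anything that genuinely takes a while,
    # e.g. waiting on the archival worker's polling loop.
    time.sleep(2)
    assert True
```
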
cua2-core/src/cua2_core/models/models.py CHANGED
@@ -3,6 +3,7 @@ import os
 import threading
 from datetime import datetime
 from typing import Annotated, Literal, Optional
+from uuid import uuid4
 
 from cua2_core.services.agent_utils.function_parser import FunctionCall
 from pydantic import BaseModel, Field, PrivateAttr, field_serializer, model_validator
@@ -106,6 +107,15 @@ class AgentStep(BaseModel):
     thought: Optional[str] = None
     actions: list[AgentAction] = []
 
+    @field_serializer("image")
+    def serialize_image(self, image: str, _info):
+        """Convert image to path when dumping to JSON"""
+
+        if _info.context and _info.context.get("image_as_path", True):
+            return f"{self.traceId}-{self.stepId}.png"
+
+        return image
+
     @field_serializer("actions")
     def serialize_actions(self, actions: list[AgentAction], _info):
         """Convert actions to list of strings when dumping (controlled by context)"""
@@ -206,6 +216,7 @@ class HeartbeatEvent(BaseModel):
     """Heartbeat event"""
 
     type: Literal["heartbeat"] = "heartbeat"
+    uuid: str = Field(default_factory=lambda: str(uuid4()))
 
 
 WebSocketEvent: TypeAlias = Annotated[
@@ -266,7 +277,7 @@ class ActiveTask(BaseModel):
                 self.model_dump(
                     mode="json",
                     exclude={"_file_locks"},
-                    context={"actions_as_json": True},
+                    context={"actions_as_json": True, "image_as_path": True},
                 ),
                 f,
                 indent=2,
@@ -286,7 +297,7 @@ class ActiveTask(BaseModel):
                 self.model_dump(
                     mode="json",
                     exclude={"_file_locks"},
-                    context={"actions_as_json": True},
+                    context={"actions_as_json": True, "image_as_path": True},
                 ),
                 f,
                 indent=2,
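
The `image_as_path` context flag added here is what keeps the on-disk trace JSON small: the same step model dumps either the raw base64 screenshot (for the websocket) or just a filename (for persistence). A minimal self-contained sketch of the Pydantic v2 pattern, using a simplified stand-in for `AgentStep`:

```python
from pydantic import BaseModel, field_serializer


class Step(BaseModel):
    traceId: str
    stepId: str
    image: str  # base64-encoded screenshot

    @field_serializer("image")
    def serialize_image(self, image: str, _info):
        # With context={"image_as_path": True}, emit a filename instead
        # of the (potentially multi-megabyte) base64 payload.
        if _info.context and _info.context.get("image_as_path"):
            return f"{self.traceId}-{self.stepId}.png"
        return image


step = Step(traceId="t1", stepId="0", image="iVBORw0KGgo...")
print(step.model_dump()["image"])                                 # full base64
print(step.model_dump(context={"image_as_path": True})["image"])  # "t1-0.png"
```
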
cua2-core/src/cua2_core/routes/websocket.py CHANGED
@@ -21,8 +21,9 @@ async def websocket_endpoint(websocket: WebSocket):
     await websocket_manager.connect(websocket)
 
     try:
-        # Send welcome heartbeat
-        welcome_message = HeartbeatEvent(type="heartbeat")
+        welcome_message = HeartbeatEvent(
+            uuid=await agent_service.create_id_and_sandbox(websocket)
+        )
         await websocket_manager.send_message(welcome_message, websocket)
 
         # Keep the connection alive and wait for messages
@@ -100,5 +101,11 @@ async def websocket_endpoint(websocket: WebSocket):
     except Exception as e:
         print(f"WebSocket connection error: {e}")
     finally:
+        # Cleanup tasks and sandboxes associated with this websocket
+        try:
+            await agent_service.cleanup_tasks_for_websocket(websocket)
+        except Exception as e:
+            print(f"Error cleaning up tasks for websocket: {e}")
+
         # Ensure cleanup happens
         websocket_manager.disconnect(websocket)
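
The welcome heartbeat now doubles as a session handshake: the server allocates an id, pre-warms a sandbox under it, and returns the id in the heartbeat's `uuid`. A hypothetical client-side sketch (the endpoint URL and the `websockets` package are assumptions for illustration, not part of this diff):

```python
import asyncio
import json

import websockets  # third-party client library, assumed for illustration


async def main():
    # Hypothetical endpoint; adjust host/path to the actual deployment.
    async with websockets.connect("ws://localhost:8000/ws") as ws:
        welcome = json.loads(await ws.recv())
        assert welcome["type"] == "heartbeat"
        # Per this diff, the uuid identifies the session whose sandbox
        # was pre-created server-side.
        session_id = welcome["uuid"]
        print(f"Session {session_id} ready")


asyncio.run(main())
```
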
cua2-core/src/cua2_core/services/agent_service.py CHANGED
@@ -6,6 +6,7 @@ import os
 import time
 from io import BytesIO
 from typing import Callable, Literal
+from uuid import uuid4
 
 from cua2_core.models.models import (
     ActiveTask,
@@ -17,12 +18,14 @@ from cua2_core.models.models import (
 from cua2_core.services.agent_utils.desktop_agent import E2BVisionAgent
 from cua2_core.services.agent_utils.function_parser import parse_function_call
 from cua2_core.services.agent_utils.get_model import get_model
+from cua2_core.services.archival_service import ArchivalService
 from cua2_core.services.sandbox_service import SandboxService
 from cua2_core.websocket.websocket_manager import WebSocketException, WebSocketManager
 from e2b_desktop import Sandbox, TimeoutException
 from fastapi import WebSocket
 from PIL import Image
 from smolagents import ActionStep, AgentImage, AgentMaxStepsError, TaskStep
+from starlette.websockets import WebSocketState
 
 logger = logging.getLogger(__name__)
 
@@ -50,6 +53,35 @@ class AgentService:
         self._lock = asyncio.Lock()
         self.max_sandboxes = int(600 / num_workers)
 
+        # Initialize archival service in dedicated process
+        self.archival_service = ArchivalService(
+            hf_token=os.getenv("HF_TOKEN"),
+            hf_dataset_repo="smolagents/cua_traces",
+            data_dir="data",
+            archive_interval_minutes=30,
+            folder_age_threshold_minutes=30,
+        )
+        # Start the archival service process
+        self.archival_service.start()
+
+    def _update_archival_active_tasks(self):
+        """
+        Update the archival service with current active task IDs.
+        Should be called whenever tasks are added or removed.
+        """
+        if self.archival_service.is_alive():
+            self.archival_service.update_active_tasks(set(self.active_tasks.keys()))
+
+    async def create_id_and_sandbox(self, websocket: WebSocket) -> str:
+        """Create a new ID and sandbox"""
+        async with self._lock:
+            uuid = str(uuid4())
+            while uuid in self.active_tasks:
+                uuid = str(uuid4())
+            self.task_websockets[uuid] = websocket
+            await self.sandbox_service.acquire_sandbox(uuid)
+            return uuid
+
     async def process_user_task(
         self, trace: AgentTrace, websocket: WebSocket
     ) -> str | None:
@@ -60,6 +92,9 @@ class AgentService:
         trace.traceMetadata = AgentTraceMetadata(traceId=trace_id)
 
         async with self._lock:
+            if self.task_websockets[trace_id] != websocket:
+                raise WebSocketException("WebSocket mismatch")
+
             active_task = ActiveTask(
                 message_id=trace_id,
                 instruction=trace.instruction,
@@ -79,9 +114,11 @@ class AgentService:
 
             # Store the task and websocket for this task
             self.active_tasks[trace_id] = active_task
-            self.task_websockets[trace_id] = websocket
             self.last_screenshot[trace_id] = None
 
+            # Update archival service with new active task
+            self._update_archival_active_tasks()
+
             asyncio.create_task(self._agent_processing(trace_id))
 
             return trace_id
@@ -111,8 +148,15 @@ class AgentService:
 
         model = get_model(self.active_tasks[message_id].model_id)
 
-        # Acquire a sandbox from the pool
-        sandbox = await self.sandbox_service.acquire_sandbox(message_id)
+        max_attempts = 10
+        for _ in range(max_attempts):
+            response = await self.sandbox_service.acquire_sandbox(message_id)
+            if response.sandbox is not None and response.state == "ready":
+                sandbox = response.sandbox
+                break
+            elif response.state == "max_sandboxes_reached":
+                raise Exception("No sandbox available: pool limit reached")
+            await asyncio.sleep(2)
         if sandbox is None:
             raise Exception("No sandbox available: pool limit reached")
 
@@ -180,7 +224,12 @@ class AgentService:
 
         finally:
             # Send completion event
-            if not websocket_exception:
+            # Check if websocket is still connected before sending
+            if (
+                not websocket_exception
+                and websocket
+                and websocket.client_state == WebSocketState.CONNECTED
+            ):
                 await self.websocket_manager.send_agent_complete(
                     metadata=self.active_tasks[message_id].traceMetadata,
                     websocket=websocket,
@@ -210,9 +259,12 @@ class AgentService:
             if message_id in self.last_screenshot:
                 del self.last_screenshot[message_id]
 
+            # Update archival service after task removal
+            self._update_archival_active_tasks()
+
             # Release sandbox back to the pool
             if sandbox:
-                await self.sandbox_service.release_sandbox(sandbox)
+                await self.sandbox_service.release_sandbox(message_id)
 
     async def _agent_processing(
         self,
@@ -236,24 +288,8 @@ class AgentService:
             if memory_step.step_number > agent.max_steps:
                 raise AgentStopException("Max steps reached")
 
-            time.sleep(3)
-
-            image = self.last_screenshot[message_id]
-            assert image is not None
-
-            for previous_memory_step in (
-                agent.memory.steps
-            ):  # Remove previous screenshots from logs for lean processing
-                if (
-                    isinstance(previous_memory_step, ActionStep)
-                    and previous_memory_step.step_number is not None
-                    and previous_memory_step.step_number <= memory_step.step_number - 1
-                ):
-                    previous_memory_step.observations_images = None
-                elif isinstance(previous_memory_step, TaskStep):
-                    previous_memory_step.task_images = None
-
-            memory_step.observations_images = [image.copy()]
+            if self.active_tasks[message_id].traceMetadata.completed:
+                raise AgentStopException("Task not completed")
 
             model_output = (
                 memory_step.model_output_message.content
@@ -280,6 +316,35 @@ class AgentService:
                     """The task failed due to an error"""  # TODO: To Handle in front
                 )
 
+            agent_actions = (
+                AgentAction.from_function_calls(parse_function_call(action_sequence))
+                if action_sequence
+                else None
+            )
+
+            if not (
+                agent_actions is not None
+                and any(action.function_name == "wait" for action in agent_actions)
+            ):
+                time.sleep(3)
+
+            image = self.last_screenshot[message_id]
+            assert image is not None
+
+            for previous_memory_step in (
+                agent.memory.steps
+            ):  # Remove previous screenshots from logs for lean processing
+                if (
+                    isinstance(previous_memory_step, ActionStep)
+                    and previous_memory_step.step_number is not None
+                    and previous_memory_step.step_number <= memory_step.step_number - 1
+                ):
+                    previous_memory_step.observations_images = None
+                elif isinstance(previous_memory_step, TaskStep):
+                    previous_memory_step.task_images = None
+
+            memory_step.observations_images = [image.copy()]
+
             if memory_step.observations_images:
                 image = memory_step.observations_images[0]
                 buffered = BytesIO()
@@ -295,11 +360,7 @@ class AgentService:
                 stepId=str(memory_step.step_number),
                 image=image_base64,
                 thought=thought,
-                actions=AgentAction.from_function_calls(
-                    parse_function_call(action_sequence)
-                )
-                if action_sequence
-                else None,
+                actions=agent_actions,
                 error=memory_step.error.message if memory_step.error else None,
                 duration=memory_step.timing.duration,
                 inputTokensUsed=memory_step.token_usage.input_tokens,
@@ -317,15 +378,16 @@ class AgentService:
             self.active_tasks[message_id].update_step(step)
 
             websocket = self.task_websockets.get(message_id)
-            future = asyncio.run_coroutine_threadsafe(
-                self.websocket_manager.send_agent_progress(
-                    step=step,
-                    metadata=self.active_tasks[message_id].traceMetadata,
-                    websocket=websocket,
-                ),
-                loop,
-            )
-            future.result()
+            if websocket and websocket.client_state == WebSocketState.CONNECTED:
+                future = asyncio.run_coroutine_threadsafe(
+                    self.websocket_manager.send_agent_progress(
+                        step=step,
+                        metadata=self.active_tasks[message_id].traceMetadata,
+                        websocket=websocket,
+                    ),
+                    loop,
+                )
                future.result()
 
             if self.active_tasks[message_id].traceMetadata.completed:
                 raise AgentStopException("Task not completed")
@@ -419,3 +481,40 @@ class AgentService:
         self.active_tasks[trace_id].update_trace_metadata(
             completed=True,
         )
+
+    async def cleanup_tasks_for_websocket(self, websocket: WebSocket):
+        """
+        Clean up all tasks associated with a disconnected websocket.
+        This will stop the tasks and release their sandboxes.
+        """
+        tasks_to_cleanup = []
+
+        # Find all message_ids associated with this websocket
+        async with self._lock:
+            for message_id, ws in list(self.task_websockets.items()):
+                if ws == websocket:
+                    tasks_to_cleanup.append(message_id)
+                    logger.info(
+                        f"Marking task {message_id} for cleanup due to websocket disconnect"
+                    )
+
+        # Cleanup each task
+        for message_id in tasks_to_cleanup:
+            try:
+                # Mark task as completed to stop the agent
+                if message_id in self.active_tasks:
+                    self.active_tasks[message_id].update_trace_metadata(
+                        completed=True,
+                    )
+                    logger.info(
+                        f"Stopped task {message_id} due to websocket disconnect"
+                    )
+
+                # Release the sandbox immediately
+                await self.sandbox_service.release_sandbox(message_id)
+                logger.info(
+                    f"Released sandbox for task {message_id} due to websocket disconnect"
+                )
+
+            except Exception as e:
+                logger.error(f"Error cleaning up task {message_id}: {e}", exc_info=True)
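
One pattern in this file is worth calling out: the step callback runs inside the agent's worker thread, so progress events must be marshalled onto the server's event loop. That is what the `asyncio.run_coroutine_threadsafe(...)` / `future.result()` pair does, now guarded by a connection-state check. A minimal self-contained sketch of the pattern (names are illustrative):

```python
import asyncio
import threading


async def send_progress(step: int):
    # Stand-in for websocket_manager.send_agent_progress(...)
    print(f"sent step {step}")


def agent_thread(loop: asyncio.AbstractEventLoop) -> None:
    # Runs outside the event loop: schedule the coroutine onto the loop
    # and block this thread until it has actually completed.
    for step in range(3):
        future = asyncio.run_coroutine_threadsafe(send_progress(step), loop)
        future.result()


async def main():
    loop = asyncio.get_running_loop()
    worker = threading.Thread(target=agent_thread, args=(loop,))
    worker.start()
    # Join in a thread so the loop stays free to run the scheduled coroutines.
    await asyncio.to_thread(worker.join)


asyncio.run(main())
```
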
cua2-core/src/cua2_core/services/archival_service.py ADDED
@@ -0,0 +1,423 @@
+"""
+Service for automatic data archival to HuggingFace datasets.
+
+This service runs in a dedicated process that periodically:
+1. Scans for old trace data folders
+2. Compresses them into tar.gz archives
+3. Uploads to HuggingFace dataset repository
+4. Verifies successful upload
+5. Deletes local files only after verification
+"""
+
+import logging
+import multiprocessing
+import multiprocessing.synchronize
+import os
+import shutil
+import signal
+import tarfile
+import time
+from pathlib import Path
+from typing import Any
+
+from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub.utils import HfHubHTTPError
+
+# Configure logging for the process
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+
+class ArchivalService:
+    """Service for handling automatic data archival to HuggingFace in a dedicated process"""
+
+    def __init__(
+        self,
+        hf_token: str | None = os.getenv("HF_TOKEN"),
+        hf_dataset_repo: str | None = "smolagents/cua_traces",
+        data_dir: str = "data",
+        archive_interval_minutes: int = 30,
+        folder_age_threshold_minutes: int = 30,
+    ):
+        """
+        Initialize the archival service.
+
+        Args:
+            hf_token: HuggingFace API token
+            hf_dataset_repo: HuggingFace dataset repository ID (e.g., "username/dataset-name")
+            data_dir: Directory containing trace data folders
+            archive_interval_minutes: How often to check for old folders
+            folder_age_threshold_minutes: Minimum age before archival
+        """
+        self.hf_token = hf_token
+        self.hf_dataset_repo = hf_dataset_repo
+        self.data_dir = data_dir
+        self.archive_interval_minutes = archive_interval_minutes
+        self.folder_age_threshold_minutes = folder_age_threshold_minutes
+
+        # Multiprocessing components
+        self._process: multiprocessing.Process | None = None
+        self._stop_event: multiprocessing.synchronize.Event = multiprocessing.Event()
+        self._manager = multiprocessing.Manager()
+        self._active_tasks: Any = self._manager.dict()  # DictProxy type
+
+    def start(self):
+        """Start the archival service in a dedicated process."""
+        if self._process and self._process.is_alive():
+            logger.warning("Archival service is already running")
+            return
+
+        if not self.hf_token or not self.hf_dataset_repo:
+            logger.warning(
+                "HuggingFace credentials or dataset repo not configured. Data archival disabled."
+            )
+            return
+
+        self._stop_event.clear()
+        self._process = multiprocessing.Process(
+            target=_archival_worker_process,
+            args=(
+                self.hf_token,
+                self.hf_dataset_repo,
+                self.data_dir,
+                self.archive_interval_minutes,
+                self.folder_age_threshold_minutes,
+                self._stop_event,
+                self._active_tasks,
+            ),
+            daemon=True,
+            name="ArchivalWorker",
+        )
+        self._process.start()
+        logger.info(
+            f"Started archival service in process {self._process.pid}. "
+            f"Checking every {self.archive_interval_minutes} minutes."
+        )
+
+    def stop(self, timeout: float = 10.0):
+        """
+        Stop the archival service process.
+
+        Args:
+            timeout: Maximum time to wait for process to terminate (seconds)
+        """
+        if not self._process or not self._process.is_alive():
+            return
+
+        logger.info(f"Stopping archival service (PID: {self._process.pid})...")
+        self._stop_event.set()
+
+        self._process.join(timeout=timeout)
+
+        if self._process.is_alive():
+            logger.warning("Archival process did not stop gracefully, terminating...")
+            self._process.terminate()
+            self._process.join(timeout=2.0)
+
+        if self._process.is_alive():
+            logger.error("Force killing archival process...")
+            self._process.kill()
+            self._process.join()
+
+        logger.info("Archival service stopped")
+        self._process = None
+
+    def update_active_tasks(self, active_task_ids: set[str]):
+        """
+        Update the set of active task IDs.
+        The archival process will skip folders for these tasks.
+
+        Args:
+            active_task_ids: Set of currently active trace IDs
+        """
+        # Clear and update the shared dict
+        self._active_tasks.clear()
+        for task_id in active_task_ids:
+            self._active_tasks[task_id] = True
+
+    def is_alive(self) -> bool:
+        """Check if the archival process is running."""
+        return self._process is not None and self._process.is_alive()
+
+
+def _archival_worker_process(
+    hf_token: str,
+    hf_dataset_repo: str,
+    data_dir: str,
+    archive_interval_minutes: int,
+    folder_age_threshold_minutes: int,
+    stop_event: multiprocessing.synchronize.Event,
+    active_tasks: Any,
+):
+    """
+    Worker process that performs the archival operations.
+    Runs in a separate process from the main application.
+
+    Args:
+        hf_token: HuggingFace API token
+        hf_dataset_repo: HuggingFace dataset repository
+        data_dir: Data directory path
+        archive_interval_minutes: Check interval
+        folder_age_threshold_minutes: Folder age threshold
+        stop_event: Event to signal process shutdown
+        active_tasks: Shared dict of active task IDs
+    """
+
+    def signal_handler(signum, frame):
+        """Handle termination signals gracefully."""
+        logger.info(f"Received signal {signum}, stopping archival worker...")
+        stop_event.set()
+
+    # Register signal handlers
+    signal.signal(signal.SIGTERM, signal_handler)
+    signal.signal(signal.SIGINT, signal_handler)
+
+    # Initialize HuggingFace API in this process
+    hf_api = HfApi(token=hf_token)
+
+    logger.info(
+        f"Archival worker started (PID: {os.getpid()}). "
+        f"Checking every {archive_interval_minutes} minutes."
+    )
+
+    # Main worker loop
+    while not stop_event.is_set():
+        try:
+            # Sleep in small intervals to be responsive to stop_event
+            for _ in range(archive_interval_minutes * 60):
+                if stop_event.is_set():
+                    break
+                time.sleep(1)
+
+            if stop_event.is_set():
+                break
+
+            logger.info("Starting data archival check...")
+            _process_old_folders(
+                data_dir=data_dir,
+                folder_age_threshold_minutes=folder_age_threshold_minutes,
+                active_tasks=active_tasks,
+                hf_api=hf_api,
+                hf_dataset_repo=hf_dataset_repo,
+                hf_token=hf_token,
+            )
+
+        except Exception as e:
+            logger.error(f"Error in archival worker: {e}", exc_info=True)
+            # Continue running despite errors
+
+    logger.info("Archival worker shutting down gracefully")
+
+
+def _process_old_folders(
+    data_dir: str,
+    folder_age_threshold_minutes: int,
+    active_tasks: Any,
+    hf_api: HfApi,
+    hf_dataset_repo: str,
+    hf_token: str,
+):
+    """
+    Process and archive folders older than the threshold.
+    Runs in the archival worker process.
+    """
+    if not os.path.exists(data_dir):
+        logger.warning(f"Data directory {data_dir} does not exist")
+        return
+
+    data_path = Path(data_dir)
+    current_time = time.time()
+    threshold_seconds = folder_age_threshold_minutes * 60
+
+    # Get all trace folders
+    try:
+        trace_folders = [
+            f for f in data_path.iterdir() if f.is_dir() and f.name.startswith("trace-")
+        ]
+    except Exception as e:
+        logger.error(f"Error listing data directory: {e}", exc_info=True)
+        return
+
+    for folder in trace_folders:
+        try:
+            # Check if folder is old enough and not currently active
+            folder_mtime = folder.stat().st_mtime
+            folder_age_seconds = current_time - folder_mtime
+
+            # Extract trace_id from folder name (format: trace-{uuid}-{model_name})
+            folder_name = folder.name
+            parts = folder_name.split("-", 2)  # Split into ['trace', uuid, model_name]
+            if len(parts) < 2:
+                logger.warning(f"Unexpected folder name format: {folder_name}")
+                continue
+
+            trace_id = parts[1]
+
+            # Skip if folder is still being used by an active task
+            if trace_id in active_tasks:
+                logger.debug(f"Skipping active task folder: {folder_name}")
+                continue
+
+            # Check if folder is old enough
+            if folder_age_seconds < threshold_seconds:
+                logger.debug(
+                    f"Folder {folder_name} is not old enough ({folder_age_seconds / 60:.1f} minutes)"
+                )
+                continue
+
+            logger.info(
+                f"Processing old folder: {folder_name} (age: {folder_age_seconds / 60:.1f} minutes)"
+            )
+
+            # Compress the folder
+            archive_path = _compress_folder(folder)
+
+            if not archive_path:
+                logger.error(f"Failed to compress folder: {folder_name}")
+                continue
+
+            # Upload to HuggingFace
+            uploaded = _upload_to_huggingface(hf_api, hf_dataset_repo, archive_path)
+
+            if not uploaded:
+                logger.error(f"Failed to upload archive: {archive_path.name}")
+                # Clean up the local archive file
+                archive_path.unlink(missing_ok=True)
+                continue
+
+            # Verify the file exists in the repo
+            verified = _verify_file_in_repo(
+                hf_dataset_repo, hf_token, archive_path.name
+            )
+
+            if verified:
+                logger.info(
+                    f"Successfully verified {archive_path.name} in HuggingFace repo"
+                )
+
+                # Delete the local folder
+                shutil.rmtree(folder)
+                logger.info(f"Deleted local folder: {folder_name}")
+
+                # Delete the local archive
+                archive_path.unlink(missing_ok=True)
+                logger.info(f"Deleted local archive: {archive_path.name}")
+            else:
+                logger.error(
+                    f"Could not verify {archive_path.name} in repo. Keeping local files."
+                )
+                # Keep both the folder and archive for safety
+
+        except Exception as e:
+            logger.error(f"Error processing folder {folder.name}: {e}", exc_info=True)
+
+
+def _compress_folder(folder_path: Path) -> Path | None:
+    """
+    Compress a folder into a tar.gz archive.
+
+    Args:
+        folder_path: Path to the folder to compress
+
+    Returns:
+        Path to the created archive file, or None if failed
+    """
+    try:
+        archive_name = f"{folder_path.name}.tar.gz"
+        archive_path = folder_path.parent / archive_name
+
+        logger.info(f"Compressing {folder_path.name} to {archive_name}")
+
+        with tarfile.open(archive_path, "w:gz") as tar:
+            tar.add(folder_path, arcname=folder_path.name)
+
+        archive_size = archive_path.stat().st_size
+        logger.info(
+            f"Created archive {archive_name} ({archive_size / 1024 / 1024:.2f} MB)"
+        )
+
+        return archive_path
+
+    except Exception as e:
+        logger.error(f"Error compressing folder {folder_path}: {e}", exc_info=True)
+        return None
+
+
+def _upload_to_huggingface(
+    hf_api: HfApi, hf_dataset_repo: str, archive_path: Path
+) -> bool:
+    """
+    Upload an archive file to HuggingFace dataset repository.
+
+    Args:
+        hf_api: HuggingFace API client
+        hf_dataset_repo: HuggingFace dataset repository ID
+        archive_path: Path to the archive file
+
+    Returns:
+        True if upload succeeded, False otherwise
+    """
+    try:
+        logger.info(
+            f"Uploading {archive_path.name} to HuggingFace repo {hf_dataset_repo}"
+        )
+
+        hf_api.upload_file(
+            path_or_fileobj=str(archive_path),
+            path_in_repo=archive_path.name,
+            repo_id=hf_dataset_repo,
+            repo_type="dataset",
+        )
+
+        logger.info(f"Successfully uploaded {archive_path.name} to HuggingFace")
+        return True
+
+    except Exception as e:
+        logger.error(
+            f"Error uploading {archive_path.name} to HuggingFace: {e}", exc_info=True
+        )
+        return False
+
+
+def _verify_file_in_repo(hf_dataset_repo: str, hf_token: str, filename: str) -> bool:
+    """
+    Verify that a file exists in the HuggingFace repository.
+
+    Args:
+        hf_dataset_repo: HuggingFace dataset repository ID
+        hf_token: HuggingFace API token
+        filename: Name of the file to verify
+
+    Returns:
+        True if file exists in repo, False otherwise
+    """
+    try:
+        logger.info(f"Verifying {filename} exists in HuggingFace repo")
+
+        # Try to get file info - this will raise an error if file doesn't exist
+        hf_hub_download(
+            repo_id=hf_dataset_repo,
+            filename=filename,
+            repo_type="dataset",
+            token=hf_token,
+            local_dir_use_symlinks=False,
+            # Just check if file exists without actually downloading
+            cache_dir=None,
+            local_files_only=False,
+        )
+
+        logger.info(f"Verified {filename} exists in repo")
+        return True
+
+    except HfHubHTTPError as e:
+        if e.response.status_code == 404:
+            logger.error(f"File {filename} not found in repo (404)")
+        else:
+            logger.error(f"HTTP error verifying file: {e}")
+        return False
+    except Exception as e:
+        logger.error(f"Error verifying {filename} in repo: {e}", exc_info=True)
+        return False
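
From the embedding application's point of view, the lifecycle uses only the public surface defined above. A short sketch (values are placeholders):

```python
from cua2_core.services.archival_service import ArchivalService

# Placeholder values; in the app these come from env vars / AgentService.
service = ArchivalService(
    hf_token="hf_...",
    hf_dataset_repo="username/dataset-name",
    data_dir="data",
    archive_interval_minutes=30,
    folder_age_threshold_minutes=30,
)

service.start()  # no-op (with a warning) if credentials are missing

# Folders whose trace ids appear here are never archived.
service.update_active_tasks({"trace-id-1", "trace-id-2"})

if service.is_alive():
    service.stop(timeout=10.0)  # graceful join, then terminate/kill
```
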
cua2-core/src/cua2_core/services/sandbox_service.py CHANGED
@@ -1,20 +1,26 @@
 import asyncio
-import logging
 import os
 import time
 from datetime import datetime
-from typing import Any
+from typing import Any, Literal
 
 from e2b_desktop import Sandbox
-
-logger = logging.getLogger(__name__)
+from pydantic import BaseModel
 
 SANDBOX_METADATA: dict[str, dict[str, Any]] = {}
-SANDBOX_TIMEOUT = 300
+SANDBOX_TIMEOUT = 500
+SANDBOX_CREATION_TIMEOUT = 180
 WIDTH = 1280
 HEIGHT = 960
 
 
+class SandboxResponse(BaseModel):
+    model_config = {"arbitrary_types_allowed": True}
+
+    sandbox: Sandbox | None
+    state: Literal["creating", "ready", "max_sandboxes_reached"]
+
+
 class SandboxService:
     def __init__(self, max_sandboxes: int = 50):
         if not os.getenv("E2B_API_KEY"):
@@ -24,70 +30,154 @@ class SandboxService:
         self.sandbox_metadata: dict[str, dict[str, Any]] = {}
         self.sandbox_lock = asyncio.Lock()
 
-    async def acquire_sandbox(self, session_hash: str) -> Sandbox | None:
-        async with self.sandbox_lock:
-            current_time = datetime.now()
+    async def _create_sandbox_background(
+        self, session_hash: str, expired_sandbox: Sandbox | None
+    ):
+        """Background task to create and setup a sandbox."""
+        # Kill expired sandbox first
+        if expired_sandbox:
+            try:
+                print(f"Closing expired sandbox for session {session_hash}")
+                await asyncio.to_thread(expired_sandbox.kill)
+            except Exception as e:
+                print(f"Error closing expired sandbox: {str(e)}")
+
+        def create_and_setup_sandbox():
+            desktop = Sandbox.create(
+                api_key=os.getenv("E2B_API_KEY"),
+                resolution=(WIDTH, HEIGHT),
+                dpi=96,
+                timeout=SANDBOX_TIMEOUT,
+                template="k0wmnzir0zuzye6dndlw",
+            )
+            desktop.stream.start(require_auth=True)
+            setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
+            desktop.commands.run(setup_cmd)
+            time.sleep(3)
+            return desktop
+
+        try:
+            desktop = await asyncio.to_thread(create_and_setup_sandbox)
+            print(f"Sandbox ID for session {session_hash} is {desktop.sandbox_id}.")
+
+            # Update sandbox state under lock
+            async with self.sandbox_lock:
+                self.sandboxes[session_hash] = desktop
+                self.sandbox_metadata[session_hash]["state"] = "ready"
+
+        except Exception as e:
+            print(f"Error creating sandbox for session {session_hash}: {str(e)}")
+            # Clean up metadata on failure
+            async with self.sandbox_lock:
+                if session_hash in self.sandbox_metadata:
+                    del self.sandbox_metadata[session_hash]
+
+    async def acquire_sandbox(self, session_hash: str) -> SandboxResponse:
+        current_time = datetime.now()
+        should_create = False
+        expired_sandbox = None
+
+        # Quick check under lock - only check state and mark creation
+        async with self.sandbox_lock:
+            # Check if sandbox exists and is ready
             if (
                 session_hash in self.sandboxes
                 and session_hash in self.sandbox_metadata
-                and current_time - self.sandbox_metadata[session_hash]["created_at"]
-                < SANDBOX_TIMEOUT
+                and self.sandbox_metadata[session_hash].get("state") == "ready"
+                and (
+                    current_time - self.sandbox_metadata[session_hash]["created_at"]
+                ).total_seconds()
+                < SANDBOX_CREATION_TIMEOUT
             ):
                 print(f"Reusing Sandbox for session {session_hash}")
                 self.sandbox_metadata[session_hash]["last_accessed"] = current_time
-                return self.sandboxes[session_hash]
-
-            if session_hash in self.sandboxes:
-                try:
-                    print(f"Closing expired sandbox for session {session_hash}")
-                    await asyncio.to_thread(self.sandboxes[session_hash].kill)
-                except Exception as e:
-                    print(f"Error closing expired sandbox: {str(e)}")
-            elif len(self.sandboxes) >= self.max_sandboxes:
-                return None
-
-            print(f"Creating new sandbox for session {session_hash}")
-
-            def create_and_setup_sandbox():
-                desktop = Sandbox.create(
-                    api_key=os.getenv("E2B_API_KEY"),
-                    resolution=(WIDTH, HEIGHT),
-                    dpi=96,
-                    timeout=SANDBOX_TIMEOUT,
-                    template="k0wmnzir0zuzye6dndlw",
-                )
-                desktop.stream.start(require_auth=True)
-                setup_cmd = """sudo mkdir -p /usr/lib/firefox-esr/distribution && echo '{"policies":{"OverrideFirstRunPage":"","OverridePostUpdatePage":"","DisableProfileImport":true,"DontCheckDefaultBrowser":true}}' | sudo tee /usr/lib/firefox-esr/distribution/policies.json > /dev/null"""
-                desktop.commands.run(setup_cmd)
-                time.sleep(3)
-                return desktop
-
-            desktop = await asyncio.to_thread(create_and_setup_sandbox)
-
-            print(f"Sandbox ID for session {session_hash} is {desktop.sandbox_id}.")
-
-            self.sandboxes[session_hash] = desktop
-            self.sandbox_metadata[session_hash] = {
-                "created_at": current_time,
-                "last_accessed": current_time,
-            }
-            return desktop
+                return SandboxResponse(
+                    sandbox=self.sandboxes[session_hash], state="ready"
+                )
+
+            # Check if sandbox is already being created
+            if (
+                session_hash in self.sandbox_metadata
+                and self.sandbox_metadata[session_hash].get("state") == "creating"
+            ):
+                print(f"Sandbox for session {session_hash} is already being created")
+                return SandboxResponse(sandbox=None, state="creating")
+
+            # Mark expired sandbox for cleanup (remove from dict within lock)
+            if session_hash in self.sandboxes:
+                print(f"Marking expired sandbox for session {session_hash} for cleanup")
+                expired_sandbox = self.sandboxes[session_hash]
+                del self.sandboxes[session_hash]
+                if session_hash in self.sandbox_metadata:
+                    del self.sandbox_metadata[session_hash]
+
+            # Check if we have capacity
+            if len(self.sandboxes) >= self.max_sandboxes:
+                return SandboxResponse(sandbox=None, state="max_sandboxes_reached")
+
+            # Mark that we're creating this sandbox
+            print(f"Creating new sandbox for session {session_hash}")
+            self.sandbox_metadata[session_hash] = {
+                "state": "creating",
+                "created_at": current_time,
+                "last_accessed": current_time,
+            }
+            should_create = True
+
+        # Start sandbox creation in background without waiting
+        if should_create:
+            asyncio.create_task(
+                self._create_sandbox_background(session_hash, expired_sandbox)
+            )
+
+        async with self.sandbox_lock:
+            if self.sandbox_metadata[session_hash]["state"] == "creating":
+                return SandboxResponse(sandbox=None, state="creating")
+            if self.sandbox_metadata[session_hash]["state"] == "ready":
+                return SandboxResponse(
+                    sandbox=self.sandboxes[session_hash], state="ready"
+                )
+
+        return SandboxResponse(sandbox=None, state="creating")
 
     async def release_sandbox(self, session_hash: str):
+        sandbox_to_kill = None
+
+        # Remove from dictionaries under lock
         async with self.sandbox_lock:
             if session_hash in self.sandboxes:
                 print(f"Releasing sandbox for session {session_hash}")
-                await asyncio.to_thread(self.sandboxes[session_hash].kill)
+                sandbox_to_kill = self.sandboxes[session_hash]
                 del self.sandboxes[session_hash]
-                del self.sandbox_metadata[session_hash]
+            if session_hash in self.sandbox_metadata:
+                del self.sandbox_metadata[session_hash]
+
+        # Kill sandbox outside of lock
+        if sandbox_to_kill:
+            try:
+                await asyncio.to_thread(sandbox_to_kill.kill)
+            except Exception as e:
+                print(f"Error killing sandbox for session {session_hash}: {str(e)}")
 
     async def cleanup_sandboxes(self):
+        sandboxes_to_kill = []
+
+        # Collect sandboxes under lock
         async with self.sandbox_lock:
             for session_hash in list(self.sandboxes.keys()):
-                await asyncio.to_thread(self.sandboxes[session_hash].kill)
+                sandboxes_to_kill.append((session_hash, self.sandboxes[session_hash]))
                 del self.sandboxes[session_hash]
-                del self.sandbox_metadata[session_hash]
+                if session_hash in self.sandbox_metadata:
+                    del self.sandbox_metadata[session_hash]
+
+        # Kill all sandboxes outside of lock
+        for session_hash, sandbox in sandboxes_to_kill:
+            try:
+                await asyncio.to_thread(sandbox.kill)
+            except Exception as e:
+                print(f"Error killing sandbox for session {session_hash}: {str(e)}")
 
 
 if __name__ == "__main__":
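
Since `acquire_sandbox` now returns immediately with a `SandboxResponse` rather than blocking on creation, callers poll until the state flips to `"ready"` (mirroring the retry loop added in `agent_service.py`). A minimal sketch of a caller, assuming the classes defined above:

```python
import asyncio

from cua2_core.services.sandbox_service import SandboxService


async def wait_for_sandbox(service: SandboxService, session_hash: str):
    # Poll until the background creation task finishes or capacity runs out.
    for _ in range(10):
        response = await service.acquire_sandbox(session_hash)
        if response.state == "ready":
            return response.sandbox
        if response.state == "max_sandboxes_reached":
            raise RuntimeError("No sandbox available: pool limit reached")
        await asyncio.sleep(2)  # still "creating"; try again shortly
    raise TimeoutError(f"Sandbox for {session_hash} not ready in time")
```
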
cua2-core/tests/test_archival_service.py ADDED
@@ -0,0 +1,585 @@
+"""
+Tests for the ArchivalService multiprocessing implementation.
+"""
+
+import os
+import shutil
+import tarfile
+import tempfile
+import time
+from pathlib import Path
+from unittest.mock import MagicMock, Mock, patch
+
+import pytest
+from cua2_core.services.archival_service import (
+    ArchivalService,
+    _compress_folder,
+    _process_old_folders,
+    _upload_to_huggingface,
+    _verify_file_in_repo,
+)
+from huggingface_hub.utils import HfHubHTTPError
+
+
+@pytest.fixture
+def temp_data_dir():
+    """Create a temporary data directory for testing."""
+    temp_dir = tempfile.mkdtemp()
+    yield temp_dir
+    # Cleanup
+    if os.path.exists(temp_dir):
+        shutil.rmtree(temp_dir)
+
+
+@pytest.fixture
+def mock_hf_api():
+    """Create a mock HuggingFace API client."""
+    mock_api = MagicMock()
+    mock_api.upload_file.return_value = None
+    return mock_api
+
+
+@pytest.fixture
+def archival_service(temp_data_dir):
+    """Create an ArchivalService instance for testing."""
+    service = ArchivalService(
+        hf_token="test_token",
+        hf_dataset_repo="test_user/test_repo",
+        data_dir=temp_data_dir,
+        archive_interval_minutes=1,  # Short interval for testing
+        folder_age_threshold_minutes=1,
+    )
+    yield service
+    # Cleanup - stop the service if running
+    if service.is_alive():
+        service.stop(timeout=5.0)
+
+
+class TestArchivalServiceInitialization:
+    """Test ArchivalService initialization."""
+
+    def test_init_with_defaults(self):
+        """Test initialization with default values."""
+        with patch.dict(os.environ, {"HF_TOKEN": "env_token"}, clear=False):
+            # Need to pass the env token explicitly since os.getenv is called at function definition time
+            service = ArchivalService(hf_token=os.getenv("HF_TOKEN"))
+            assert service.hf_token == "env_token"
+            assert service.hf_dataset_repo == "smolagents/cua_traces"
+            assert service.data_dir == "data"
+            assert service.archive_interval_minutes == 30
+            assert service.folder_age_threshold_minutes == 30
+            assert service._process is None
+            assert not service.is_alive()
+
+    def test_init_with_custom_values(self):
+        """Test initialization with custom values."""
+        service = ArchivalService(
+            hf_token="custom_token",
+            hf_dataset_repo="custom/repo",
+            data_dir="/custom/path",
+            archive_interval_minutes=60,
+            folder_age_threshold_minutes=120,
+        )
+        assert service.hf_token == "custom_token"
+        assert service.hf_dataset_repo == "custom/repo"
+        assert service.data_dir == "/custom/path"
+        assert service.archive_interval_minutes == 60
+        assert service.folder_age_threshold_minutes == 120
+
+    def test_init_multiprocessing_components(self):
+        """Test that multiprocessing components are initialized."""
+        service = ArchivalService(hf_token="test", hf_dataset_repo="test/test")
+        assert service._stop_event is not None
+        assert service._manager is not None
+        assert service._active_tasks is not None
+
+
+class TestArchivalServiceLifecycle:
+    """Test ArchivalService lifecycle management."""
+
+    def test_start_service(self, archival_service):
+        """Test starting the archival service."""
+        archival_service.start()
+
+        # Give the process a moment to start
+        time.sleep(0.5)
+
+        assert archival_service.is_alive()
+        assert archival_service._process is not None
+        assert archival_service._process.pid is not None
+
+    def test_start_without_credentials(self, temp_data_dir):
+        """Test starting service without credentials logs warning."""
+        service = ArchivalService(
+            hf_token=None,
+            hf_dataset_repo=None,
+            data_dir=temp_data_dir,
+        )
+
+        service.start()
+
+        # Service should not start
+        assert not service.is_alive()
+        assert service._process is None
+
+    def test_start_already_running(self, archival_service):
+        """Test starting service when already running."""
+        archival_service.start()
+        time.sleep(0.5)
+
+        pid1 = archival_service._process.pid
+
+        # Try to start again
+        archival_service.start()
+
+        # Should be same process
+        assert archival_service._process.pid == pid1
+
+    def test_stop_service(self, archival_service):
+        """Test stopping the archival service."""
+        archival_service.start()
+        time.sleep(0.5)
+        assert archival_service.is_alive()
+
+        archival_service.stop(timeout=5.0)
+
+        assert not archival_service.is_alive()
+        assert archival_service._process is None
+
+    def test_stop_not_running(self, archival_service):
+        """Test stopping service when not running."""
+        # Should not raise any errors
+        archival_service.stop(timeout=1.0)
+        assert not archival_service.is_alive()
+
+    def test_stop_with_timeout(self, archival_service):
+        """Test stop with timeout and force termination."""
+        archival_service.start()
+        time.sleep(0.5)
+
+        # Stop with very short timeout to test force kill path
+        archival_service.stop(timeout=0.001)
+
+        # Process should be stopped one way or another
+        time.sleep(0.5)
+        assert not archival_service.is_alive()
+
+    def test_is_alive_returns_false_when_not_started(self, archival_service):
+        """Test is_alive returns False when service not started."""
+        assert not archival_service.is_alive()
+
+
+class TestActiveTasksManagement:
+    """Test active tasks management."""
+
+    def test_update_active_tasks(self, archival_service):
+        """Test updating active tasks."""
+        task_ids = {"task-1", "task-2", "task-3"}
+
+        archival_service.update_active_tasks(task_ids)
+
+        # Verify tasks are in shared dict
+        for task_id in task_ids:
+            assert task_id in archival_service._active_tasks
+
+    def test_update_active_tasks_clears_old(self, archival_service):
+        """Test that updating active tasks clears old ones."""
+        archival_service.update_active_tasks({"task-1", "task-2"})
+        assert "task-1" in archival_service._active_tasks
+
+        archival_service.update_active_tasks({"task-3"})
+        assert "task-1" not in archival_service._active_tasks
+        assert "task-3" in archival_service._active_tasks
+
+    def test_update_active_tasks_empty_set(self, archival_service):
+        """Test updating with empty set."""
+        archival_service.update_active_tasks({"task-1"})
+        archival_service.update_active_tasks(set())
+
+        assert len(archival_service._active_tasks) == 0
+
+
+class TestCompressFolder:
+    """Test folder compression functionality."""
+
+    def test_compress_folder_success(self, temp_data_dir):
+        """Test successful folder compression."""
+        # Create a test folder with some files
+        test_folder = Path(temp_data_dir) / "trace-test-123-model"
+        test_folder.mkdir()
+        (test_folder / "file1.txt").write_text("test content 1")
+        (test_folder / "file2.txt").write_text("test content 2")
+
+        archive_path = _compress_folder(test_folder)
+
+        assert archive_path is not None
+        assert archive_path.exists()
+        assert archive_path.name == "trace-test-123-model.tar.gz"
+        assert archive_path.suffix == ".gz"
+
+        # Verify archive contents
+        with tarfile.open(archive_path, "r:gz") as tar:
+            members = tar.getnames()
+            assert "trace-test-123-model/file1.txt" in members
+            assert "trace-test-123-model/file2.txt" in members
+
+        # Cleanup
+        archive_path.unlink()
+
+    def test_compress_folder_empty_folder(self, temp_data_dir):
+        """Test compressing an empty folder."""
+        test_folder = Path(temp_data_dir) / "trace-empty-456-model"
+        test_folder.mkdir()
+
+        archive_path = _compress_folder(test_folder)
+
+        assert archive_path is not None
+        assert archive_path.exists()
+
+        # Cleanup
+        archive_path.unlink()
+
+    def test_compress_folder_nonexistent(self, temp_data_dir):
+        """Test compressing a nonexistent folder."""
+        test_folder = Path(temp_data_dir) / "nonexistent"
+
+        archive_path = _compress_folder(test_folder)
+
+        assert archive_path is None
+
+    def test_compress_folder_with_subdirectories(self, temp_data_dir):
+        """Test compressing folder with subdirectories."""
+        test_folder = Path(temp_data_dir) / "trace-nested-789-model"
+        test_folder.mkdir()
+        subdir = test_folder / "subdir"
+        subdir.mkdir()
+        (subdir / "nested.txt").write_text("nested content")
+
+        archive_path = _compress_folder(test_folder)
+
+        assert archive_path is not None
+
+        # Verify nested structure preserved
+        with tarfile.open(archive_path, "r:gz") as tar:
+            members = tar.getnames()
+            assert "trace-nested-789-model/subdir/nested.txt" in members
+
+        # Cleanup
+        archive_path.unlink()
+
+
+class TestUploadToHuggingFace:
+    """Test HuggingFace upload functionality."""
+
+    def test_upload_success(self, mock_hf_api, temp_data_dir):
+        """Test successful upload to HuggingFace."""
+        # Create a test archive
+        archive_path = Path(temp_data_dir) / "test-archive.tar.gz"
+        archive_path.write_text("test archive content")
+
+        result = _upload_to_huggingface(mock_hf_api, "test/repo", archive_path)
+
+        assert result is True
+        mock_hf_api.upload_file.assert_called_once_with(
+            path_or_fileobj=str(archive_path),
+            path_in_repo="test-archive.tar.gz",
+            repo_id="test/repo",
+            repo_type="dataset",
+        )
+
+    def test_upload_failure(self, mock_hf_api, temp_data_dir):
+        """Test upload failure."""
+        mock_hf_api.upload_file.side_effect = Exception("Upload failed")
+
+        archive_path = Path(temp_data_dir) / "test-archive.tar.gz"
+        archive_path.write_text("test archive content")
+
+        result = _upload_to_huggingface(mock_hf_api, "test/repo", archive_path)
+
+        assert result is False
+
+    def test_upload_nonexistent_file(self, mock_hf_api, temp_data_dir):
+        """Test uploading a nonexistent file."""
+        archive_path = Path(temp_data_dir) / "nonexistent.tar.gz"
+
+        # Make the mock raise an exception when trying to upload nonexistent file
+        mock_hf_api.upload_file.side_effect = FileNotFoundError("File not found")
+
+        result = _upload_to_huggingface(mock_hf_api, "test/repo", archive_path)
+
+        assert result is False
+
+
+class TestVerifyFileInRepo:
+    """Test file verification functionality."""
+
+    @patch("cua2_core.services.archival_service.hf_hub_download")
+    def test_verify_success(self, mock_download):
+        """Test successful file verification."""
+        mock_download.return_value = "/path/to/file"
+
+        result = _verify_file_in_repo("test/repo", "test_token", "test.tar.gz")
+
+        assert result is True
+        mock_download.assert_called_once()
+
+    @patch("cua2_core.services.archival_service.hf_hub_download")
+    def test_verify_file_not_found(self, mock_download):
+        """Test verification when file not found (404)."""
+        mock_response = Mock()
+        mock_response.status_code = 404
+        error = HfHubHTTPError("Not found", response=mock_response)
+        mock_download.side_effect = error
+
+        result = _verify_file_in_repo("test/repo", "test_token", "test.tar.gz")
+
+        assert result is False
+
+    @patch("cua2_core.services.archival_service.hf_hub_download")
+    def test_verify_other_http_error(self, mock_download):
+        """Test verification with other HTTP errors."""
+        mock_response = Mock()
+        mock_response.status_code = 500
+        error = HfHubHTTPError("Server error", response=mock_response)
+        mock_download.side_effect = error
+
+        result = _verify_file_in_repo("test/repo", "test_token", "test.tar.gz")
+
+        assert result is False
+
+    @patch("cua2_core.services.archival_service.hf_hub_download")
+    def test_verify_generic_exception(self, mock_download):
+        """Test verification with generic exception."""
+        mock_download.side_effect = Exception("Generic error")
+
+        result = _verify_file_in_repo("test/repo", "test_token", "test.tar.gz")
+
+        assert result is False
+
+
+class TestProcessOldFolders:
+    """Test old folder processing logic."""
+
+    def test_process_old_folders_basic(self, temp_data_dir, mock_hf_api):
+        """Test processing old folders."""
+        # Create an old folder (modify mtime to make it old)
+        old_folder = Path(temp_data_dir) / "trace-old123-model"
+        old_folder.mkdir()
+        (old_folder / "data.json").write_text('{"test": "data"}')
+
+        # Make it old by modifying mtime
+        old_time = time.time() - 3600  # 1 hour ago
+        os.utime(old_folder, (old_time, old_time))
+
+        active_tasks = {}
+
+        with patch(
+            "cua2_core.services.archival_service._verify_file_in_repo",
+            return_value=True,
+        ):
+            _process_old_folders(
+                data_dir=temp_data_dir,
+                folder_age_threshold_minutes=1,
+                active_tasks=active_tasks,
+                hf_api=mock_hf_api,
+                hf_dataset_repo="test/repo",
+                hf_token="test_token",
+            )
+
+        # Folder should be deleted after successful archival
+        assert not old_folder.exists()
+
+        # Upload should have been called
+        assert mock_hf_api.upload_file.called
+
+    def test_process_folders_skips_active_tasks(self, temp_data_dir, mock_hf_api):
+        """Test that active tasks are skipped."""
+        # Create a folder for an active task
+        active_folder = Path(temp_data_dir) / "trace-active456-model"
+        active_folder.mkdir()
+        (active_folder / "data.json").write_text('{"test": "data"}')
+
+        # Make it old
+        old_time = time.time() - 3600
+        os.utime(active_folder, (old_time, old_time))
+
+        # Mark as active
+        active_tasks = {"active456": True}
+
+        _process_old_folders(
+            data_dir=temp_data_dir,
+            folder_age_threshold_minutes=1,
+            active_tasks=active_tasks,
+            hf_api=mock_hf_api,
+            hf_dataset_repo="test/repo",
+            hf_token="test_token",
+        )
+
+        # Folder should still exist (not archived)
+        assert active_folder.exists()
+
+        # Upload should not have been called
+        assert not mock_hf_api.upload_file.called
+
+    def test_process_folders_skips_recent(self, temp_data_dir, mock_hf_api):
+        """Test that recent folders are skipped."""
+        # Create a recent folder
+        recent_folder = Path(temp_data_dir) / "trace-recent789-model"
+        recent_folder.mkdir()
+        (recent_folder / "data.json").write_text('{"test": "data"}')
+
+        # Folder is fresh (current time)
+
+        active_tasks = {}
+
+        _process_old_folders(
+            data_dir=temp_data_dir,
+            folder_age_threshold_minutes=60,  # 60 minutes threshold
+            active_tasks=active_tasks,
+            hf_api=mock_hf_api,
440
+ hf_dataset_repo="test/repo",
441
+ hf_token="test_token",
442
+ )
443
+
444
+ # Folder should still exist (too recent)
445
+ assert recent_folder.exists()
446
+
447
+ # Upload should not have been called
448
+ assert not mock_hf_api.upload_file.called
449
+
450
+ def test_process_folders_keeps_on_verification_failure(
451
+ self, temp_data_dir, mock_hf_api
452
+ ):
453
+ """Test that folders are kept if verification fails."""
454
+ old_folder = Path(temp_data_dir) / "trace-verify-fail-model"
455
+ old_folder.mkdir()
456
+ (old_folder / "data.json").write_text('{"test": "data"}')
457
+
458
+ old_time = time.time() - 3600
459
+ os.utime(old_folder, (old_time, old_time))
460
+
461
+ active_tasks = {}
462
+
463
+ # Mock verification to fail
464
+ with patch(
465
+ "cua2_core.services.archival_service._verify_file_in_repo",
466
+ return_value=False,
467
+ ):
468
+ _process_old_folders(
469
+ data_dir=temp_data_dir,
470
+ folder_age_threshold_minutes=1,
471
+ active_tasks=active_tasks,
472
+ hf_api=mock_hf_api,
473
+ hf_dataset_repo="test/repo",
474
+ hf_token="test_token",
475
+ )
476
+
477
+ # Folder should still exist (verification failed)
478
+ assert old_folder.exists()
479
+
480
+ def test_process_folders_handles_nonexistent_dir(self, mock_hf_api):
481
+ """Test handling of nonexistent data directory."""
482
+ # Should not raise exception
483
+ _process_old_folders(
484
+ data_dir="/nonexistent/path",
485
+ folder_age_threshold_minutes=1,
486
+ active_tasks={},
487
+ hf_api=mock_hf_api,
488
+ hf_dataset_repo="test/repo",
489
+ hf_token="test_token",
490
+ )
491
+
492
+ # No uploads should occur
493
+ assert not mock_hf_api.upload_file.called
494
+
495
+ def test_process_folders_handles_bad_folder_names(self, temp_data_dir, mock_hf_api):
496
+ """Test handling of folders with unexpected name format."""
497
+ # Create folder with bad name format
498
+ bad_folder = Path(temp_data_dir) / "trace-invalid" # Missing parts
499
+ bad_folder.mkdir()
500
+
501
+ old_time = time.time() - 3600
502
+ os.utime(bad_folder, (old_time, old_time))
503
+
504
+ # Should not raise exception
505
+ _process_old_folders(
506
+ data_dir=temp_data_dir,
507
+ folder_age_threshold_minutes=1,
508
+ active_tasks={},
509
+ hf_api=mock_hf_api,
510
+ hf_dataset_repo="test/repo",
511
+ hf_token="test_token",
512
+ )
513
+
514
+ # Folder should still exist (invalid name)
515
+ assert bad_folder.exists()
516
+
517
+
518
+ class TestIntegration:
519
+ """Integration tests for the complete archival workflow."""
520
+
521
+ @pytest.mark.slow
522
+ def test_full_archival_workflow(self, temp_data_dir):
523
+ """Test the complete archival workflow end-to-end."""
524
+ # Create service
525
+ service = ArchivalService(
526
+ hf_token="test_token",
527
+ hf_dataset_repo="test/repo",
528
+ data_dir=temp_data_dir,
529
+ archive_interval_minutes=1,
530
+ folder_age_threshold_minutes=1,
531
+ )
532
+
533
+ # Create test folder
534
+ test_folder = Path(temp_data_dir) / "trace-integration-test-model"
535
+ test_folder.mkdir()
536
+ (test_folder / "test.json").write_text('{"test": "data"}')
537
+
538
+ # Make it old
539
+ old_time = time.time() - 3600
540
+ os.utime(test_folder, (old_time, old_time))
541
+
542
+ # Start service (mocked to prevent actual HF upload)
543
+ with (
544
+ patch(
545
+ "cua2_core.services.archival_service._upload_to_huggingface",
546
+ return_value=True,
547
+ ),
548
+ patch(
549
+ "cua2_core.services.archival_service._verify_file_in_repo",
550
+ return_value=True,
551
+ ),
552
+ ):
553
+ service.start()
554
+ time.sleep(2) # Wait for at least one cycle
555
+
556
+ service.stop(timeout=5.0)
557
+
558
+ assert not service.is_alive()
559
+
560
+ def test_service_survives_worker_errors(self, temp_data_dir):
561
+ """Test that service continues running despite worker errors."""
562
+ service = ArchivalService(
563
+ hf_token="test_token",
564
+ hf_dataset_repo="test/repo",
565
+ data_dir=temp_data_dir,
566
+ archive_interval_minutes=1,
567
+ folder_age_threshold_minutes=1,
568
+ )
569
+
570
+ # Mock to raise exception
571
+ with patch(
572
+ "cua2_core.services.archival_service._process_old_folders",
573
+ side_effect=Exception("Test error"),
574
+ ):
575
+ service.start()
576
+ time.sleep(2)
577
+
578
+ # Service should still be alive
579
+ assert service.is_alive()
580
+
581
+ service.stop(timeout=5.0)
582
+
583
+
584
+ if __name__ == "__main__":
585
+ pytest.main([__file__, "-v", "--tb=short"])
cua2-front/src/App.tsx CHANGED
@@ -15,9 +15,10 @@ const App = () => {
15
  // Initialize WebSocket connection at app level so it persists across route changes
16
  const { stopCurrentTask } = useAgentWebSocket({ url: getWebSocketUrl() });
17
 
18
- // Store stopCurrentTask in window for global access
19
  (window as Window & { __stopCurrentTask?: () => void }).__stopCurrentTask = stopCurrentTask;
20
 
 
21
  return (
22
  <ThemeProvider theme={theme}>
23
  <CssBaseline />
 
15
  // Initialize WebSocket connection at app level so it persists across route changes
16
  const { stopCurrentTask } = useAgentWebSocket({ url: getWebSocketUrl() });
17
 
18
+ // Expose stopCurrentTask on window for global access
19
  (window as Window & { __stopCurrentTask?: () => void }).__stopCurrentTask = stopCurrentTask;
20
 
21
+
22
  return (
23
  <ThemeProvider theme={theme}>
24
  <CssBaseline />
cua2-front/src/components/sandbox/completionview/CompletionView.tsx CHANGED
@@ -1,18 +1,18 @@
1
- import React from 'react';
2
- import { Box, Typography, Button, Divider, Alert, Paper } from '@mui/material';
3
- import CheckIcon from '@mui/icons-material/Check';
4
- import CloseIcon from '@mui/icons-material/Close';
5
- import StopCircleIcon from '@mui/icons-material/StopCircle';
6
- import HourglassEmptyIcon from '@mui/icons-material/HourglassEmpty';
7
  import AddIcon from '@mui/icons-material/Add';
8
- import SmartToyIcon from '@mui/icons-material/SmartToy';
9
  import AssignmentIcon from '@mui/icons-material/Assignment';
10
  import ChatBubbleOutlineIcon from '@mui/icons-material/ChatBubbleOutline';
11
- import AccessTimeIcon from '@mui/icons-material/AccessTime';
 
 
 
12
  import InputIcon from '@mui/icons-material/Input';
13
  import OutputIcon from '@mui/icons-material/Output';
14
- import FormatListNumberedIcon from '@mui/icons-material/FormatListNumbered';
15
- import { FinalStep, AgentTrace, AgentStep } from '@/types/agent';
 
 
16
  import { DownloadGifButton } from './DownloadGifButton';
17
  import { DownloadJsonButton } from './DownloadJsonButton';
18
 
@@ -65,14 +65,14 @@ export const CompletionView: React.FC<CompletionViewProps> = ({
65
  case 'sandbox_timeout':
66
  return {
67
  icon: <AccessTimeIcon sx={{ fontSize: 28 }} />,
68
- title: 'Sandbox Timeout',
69
  color: 'error.main',
70
  };
71
  case 'failure':
72
  default:
73
  return {
74
  icon: <CloseIcon sx={{ fontSize: 28 }} />,
75
- title: 'Task Failed',
76
  color: 'error.main',
77
  };
78
  }
 
1
+ import { AgentStep, AgentTrace, FinalStep } from '@/types/agent';
2
+ import AccessTimeIcon from '@mui/icons-material/AccessTime';
 
 
 
 
3
  import AddIcon from '@mui/icons-material/Add';
 
4
  import AssignmentIcon from '@mui/icons-material/Assignment';
5
  import ChatBubbleOutlineIcon from '@mui/icons-material/ChatBubbleOutline';
6
+ import CheckIcon from '@mui/icons-material/Check';
7
+ import CloseIcon from '@mui/icons-material/Close';
8
+ import FormatListNumberedIcon from '@mui/icons-material/FormatListNumbered';
9
+ import HourglassEmptyIcon from '@mui/icons-material/HourglassEmpty';
10
  import InputIcon from '@mui/icons-material/Input';
11
  import OutputIcon from '@mui/icons-material/Output';
12
+ import SmartToyIcon from '@mui/icons-material/SmartToy';
13
+ import StopCircleIcon from '@mui/icons-material/StopCircle';
14
+ import { Alert, Box, Button, Divider, Paper, Typography } from '@mui/material';
15
+ import React from 'react';
16
  import { DownloadGifButton } from './DownloadGifButton';
17
  import { DownloadJsonButton } from './DownloadJsonButton';
18
 
 
65
  case 'sandbox_timeout':
66
  return {
67
  icon: <AccessTimeIcon sx={{ fontSize: 28 }} />,
68
+ title: 'Max Sandbox Time Reached',
69
  color: 'error.main',
70
  };
71
  case 'failure':
72
  default:
73
  return {
74
  icon: <CloseIcon sx={{ fontSize: 28 }} />,
75
+ title: 'Task Failed (Agent Internal Error)',
76
  color: 'error.main',
77
  };
78
  }
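
The two title changes above are easier to read against the full final-state mapping. A minimal sketch only, not the component's actual code: the titles for states this diff does not touch are placeholders.

```typescript
// Sketch: the component resolves banner titles in a switch; a lookup table
// over the final-state union is equivalent. Only the 'sandbox_timeout' and
// 'failure' titles come from this diff; the rest are placeholders.
type FinalState = 'success' | 'stopped' | 'max_steps_reached' | 'sandbox_timeout' | 'failure';

const STATUS_TITLES: Record<FinalState, string> = {
  success: 'Task Completed',                     // placeholder
  stopped: 'Task Stopped',                       // placeholder
  max_steps_reached: 'Maximum Steps Reached',    // placeholder
  sandbox_timeout: 'Max Sandbox Time Reached',   // from this diff
  failure: 'Task Failed (Agent Internal Error)', // from this diff
};

export const titleFor = (state: FinalState): string => STATUS_TITLES[state];
```
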
cua2-front/src/hooks/useAgentWebSocket.ts CHANGED
@@ -1,8 +1,7 @@
 
 
1
  import { useCallback, useEffect } from 'react';
2
  import { useWebSocket } from './useWebSocket';
3
- import { useAgentStore } from '@/stores/agentStore';
4
- import { WebSocketEvent, AgentTrace, AgentStep } from '@/types/agent';
5
- import { ulid } from 'ulid';
6
 
7
  interface UseAgentWebSocketOptions {
8
  url: string;
@@ -11,6 +10,8 @@ interface UseAgentWebSocketOptions {
11
  export const useAgentWebSocket = ({ url }: UseAgentWebSocketOptions) => {
12
  const {
13
  setTrace,
 
 
14
  updateTraceWithStep,
15
  completeTrace,
16
  setIsAgentProcessing,
@@ -52,6 +53,7 @@ export const useAgentWebSocket = ({ url }: UseAgentWebSocketOptions) => {
52
  numberOfSteps: 0,
53
  maxSteps: 200,
54
  completed: false,
 
55
  },
56
  };
57
 
@@ -94,10 +96,13 @@ export const useAgentWebSocket = ({ url }: UseAgentWebSocketOptions) => {
94
 
95
  case 'heartbeat':
96
  console.log('Heartbeat received:', event);
 
 
97
  break;
 
98
  }
99
  },
100
- [setTrace, updateTraceWithStep, completeTrace, setIsAgentProcessing, setIsConnectingToE2B, setVncUrl, setError, resetAgent]
101
  );
102
 
103
  // Handle WebSocket errors
@@ -113,10 +118,16 @@ export const useAgentWebSocket = ({ url }: UseAgentWebSocketOptions) => {
113
  onError: handleWebSocketError,
114
  });
115
 
116
- // Sync connection state to store
117
  useEffect(() => {
118
  setIsConnected(isConnected);
119
- }, [isConnected, setIsConnected]);
 
 
 
 
 
 
120
 
121
  // Create a global sendNewTask function that can be called from anywhere
122
  useEffect(() => {
@@ -125,7 +136,13 @@ export const useAgentWebSocket = ({ url }: UseAgentWebSocketOptions) => {
125
  // Reset agent state before starting a new task
126
  resetAgent();
127
 
128
- const traceId = ulid();
 
 
 
 
 
 
129
  const trace: AgentTrace = {
130
  id: traceId,
131
  instruction,
@@ -140,7 +157,8 @@ export const useAgentWebSocket = ({ url }: UseAgentWebSocketOptions) => {
140
  numberOfSteps: 0,
141
  maxSteps: 200, // Default max steps, will be updated by backend
142
  completed: false,
143
- },
 
144
  };
145
 
146
  setTrace(trace);
@@ -155,7 +173,7 @@ export const useAgentWebSocket = ({ url }: UseAgentWebSocketOptions) => {
155
 
156
  console.log('Task sent:', trace);
157
  };
158
- }, [setTrace, setIsAgentProcessing, setIsConnectingToE2B, sendMessage, resetAgent]);
159
 
160
  // Function to stop the current task
161
  const stopCurrentTask = useCallback(() => {
 
1
+ import { useAgentStore } from '@/stores/agentStore';
2
+ import { AgentTrace, AgentTraceMetadata, WebSocketEvent } from '@/types/agent';
3
  import { useCallback, useEffect } from 'react';
4
  import { useWebSocket } from './useWebSocket';
 
 
 
5
 
6
  interface UseAgentWebSocketOptions {
7
  url: string;
 
10
  export const useAgentWebSocket = ({ url }: UseAgentWebSocketOptions) => {
11
  const {
12
  setTrace,
13
+ traceId,
14
+ setTraceId,
15
  updateTraceWithStep,
16
  completeTrace,
17
  setIsAgentProcessing,
 
53
  numberOfSteps: 0,
54
  maxSteps: 200,
55
  completed: false,
56
+ final_state: null,
57
  },
58
  };
59
 
 
96
 
97
  case 'heartbeat':
98
  console.log('Heartbeat received:', event);
99
+ setTraceId(event.uuid);
100
+ console.log('TraceId set from backend:', event.uuid);
101
  break;
102
+
103
  }
104
  },
105
+ [setTrace, updateTraceWithStep, completeTrace, setIsAgentProcessing, setIsConnectingToE2B, setVncUrl, setError, resetAgent, setTraceId, traceId]
106
  );
107
 
108
  // Handle WebSocket errors
 
118
  onError: handleWebSocketError,
119
  });
120
 
121
+ // Sync connection state to store and clear traceId on disconnect
122
  useEffect(() => {
123
  setIsConnected(isConnected);
124
+
125
+ // Clear traceId when websocket disconnects
126
+ if (!isConnected) {
127
+ setTraceId(null);
128
+ console.log('WebSocket disconnected - traceId cleared');
129
+ }
130
+ }, [isConnected, setIsConnected, setTraceId]);
131
 
132
  // Create a global sendNewTask function that can be called from anywhere
133
  useEffect(() => {
 
136
  // Reset agent state before starting a new task
137
  resetAgent();
138
 
139
+ // Ensure traceId is set before creating trace
140
+ if (!traceId) {
141
+ console.error('Internal error: Cannot send task. TraceId not set. Refreshing page...');
142
+ window.location.reload();
143
+ return;
144
+ }
145
+
146
  const trace: AgentTrace = {
147
  id: traceId,
148
  instruction,
 
157
  numberOfSteps: 0,
158
  maxSteps: 200, // Default max steps, will be updated by backend
159
  completed: false,
160
+ final_state: null,
161
+ } as AgentTraceMetadata,
162
  };
163
 
164
  setTrace(trace);
 
173
 
174
  console.log('Task sent:', trace);
175
  };
176
+ }, [setTrace, setIsAgentProcessing, setIsConnectingToE2B, sendMessage, resetAgent, traceId]);
177
 
178
  // Function to stop the current task
179
  const stopCurrentTask = useCallback(() => {
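
The net effect of this hunk is a small contract: trace ids are now allocated by the backend and delivered over the heartbeat, so the frontend no longer generates them locally (the `ulid` import is removed). A minimal sketch of that flow, assuming the `HeartbeatEvent` shape added in `types/agent.ts` below; the setter stands in for the store's `setTraceId` action.

```typescript
// HeartbeatEvent mirrors the shape added in types/agent.ts.
interface HeartbeatEvent {
  type: 'heartbeat';
  uuid: string; // trace id allocated by the backend
}

// Every heartbeat (re)publishes the backend's trace id.
export function onHeartbeat(
  event: HeartbeatEvent,
  setTraceId: (id: string | null) => void,
): void {
  setTraceId(event.uuid);
}

// The id is only cleared when the connection drops, matching the
// disconnect handling in the hook above.
export function onDisconnect(setTraceId: (id: string | null) => void): void {
  setTraceId(null);
}
```
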
cua2-front/src/pages/Index.tsx DELETED
@@ -1,147 +0,0 @@
1
- import { Header, Metadata, StackSteps, VNCStream } from '@/components/mock';
2
- import { getWebSocketUrl } from '@/config/api';
3
- import { useWebSocket } from '@/hooks/useWebSocket';
4
- import { AgentStep, AgentTrace, WebSocketEvent } from '@/types/agent';
5
- import { useState } from 'react';
6
- import { ulid } from 'ulid';
7
-
8
- const Index = () => {
9
- const [trace, setTrace] = useState<AgentTrace>();
10
- const [isAgentProcessing, setIsAgentProcessing] = useState(false);
11
- const [vncUrl, setVncUrl] = useState<string>('');
12
- const [selectedModelId, setSelectedModelId] = useState<string>("Qwen/Qwen3-VL-30B-A3B-Instruct");
13
-
14
- // #################### WebSocket Connection ########################
15
-
16
- // WebSocket connection - Automatically configured based on environment
17
- const WS_URL = getWebSocketUrl();
18
-
19
- const handleWebSocketMessage = (event: WebSocketEvent) => {
20
- console.log('WebSocket event received:', event);
21
-
22
- switch (event.type) {
23
- case 'agent_start':
24
- setIsAgentProcessing(true);
25
- setTrace(event.agentTrace);
26
- console.log('Agent start received:', event.agentTrace);
27
- break;
28
-
29
- case 'agent_progress':
30
- // Add new step from an agent trace run with image, generated text, actions, tokens, and timestamp
31
- setTrace(prev => {
32
- const existingSteps = prev?.steps || [] as AgentStep[];
33
- const stepExists = existingSteps.some(step => step.stepId === event.agentStep.stepId);
34
-
35
- if (!stepExists) {
36
- return {
37
- ...prev,
38
- steps: [...existingSteps, event.agentStep],
39
- traceMetadata: event.traceMetadata,
40
- isRunning: true
41
- };
42
- }
43
- return prev;
44
- });
45
- console.log('Agent progress received:', event.agentStep);
46
- break;
47
-
48
- case 'agent_complete':
49
- setIsAgentProcessing(false);
50
- setTrace(trace => {
51
- return trace.id === event.traceMetadata.traceId
52
- ? {
53
- ...trace,
54
- isRunning: false,
55
- metadata: event.traceMetadata,
56
- }
57
- : trace;
58
- });
59
- console.log('Agent complete received:', event.traceMetadata);
60
- break;
61
-
62
- case 'agent_error':
63
- setIsAgentProcessing(false);
64
- // TODO: Handle agent error
65
- console.log('Agent error received:', event.error);
66
- break;
67
-
68
- case 'vnc_url_set':
69
- setVncUrl(event.vncUrl);
70
- // TODO: Handle VNC URL set
71
- console.log('VNC URL set received:', event.vncUrl);
72
- break;
73
-
74
- case 'vnc_url_unset':
75
- setVncUrl('');
76
- // TODO: Handle VNC URL unset
77
- console.log('VNC URL unset received:');
78
- break;
79
-
80
- case 'heartbeat':
81
- console.log('Heartbeat received:', event);
82
- break;
83
- }
84
- };
85
-
86
- const handleWebSocketError = () => {
87
- // WebSocket Frontend Error handling
88
-
89
- };
90
-
91
- const { isConnected, connectionState, sendMessage, manualReconnect } = useWebSocket({
92
- url: WS_URL,
93
- onMessage: handleWebSocketMessage,
94
- onError: handleWebSocketError,
95
- });
96
-
97
- // #################### Frontend Functionality ########################
98
-
99
- const handleModelId = (modelId: string) => {
100
- setSelectedModelId(modelId);
101
- };
102
-
103
- const handleSendNewTask = (content: string, modelId: string) => {
104
- const trace: AgentTrace = {
105
- id: ulid(),
106
- instruction: content,
107
- modelId: selectedModelId,
108
- timestamp: new Date(),
109
- isRunning: true,
110
- };
111
-
112
- setTrace(trace);
113
-
114
- // Send message to Python backend via WebSocket
115
- sendMessage({
116
- type: 'user_task',
117
- trace: trace,
118
- });
119
- };
120
-
121
- // #################### Mock Frontend Rendering ########################
122
-
123
- return (
124
- <div style={{ height: '100%', width: '100%', display: 'flex', flexDirection: 'column', backgroundColor: '#f3f4f6' }}>
125
- <Header
126
- isConnected={isConnected}
127
- isAgentProcessing={isAgentProcessing}
128
- onSendTask={handleSendNewTask}
129
- />
130
-
131
- <div style={{ flex: 1, display: 'flex', justifyContent: 'center', alignItems: 'center', overflow: 'hidden', minHeight: 0, padding: '32px' }}>
132
- <div style={{ width: '100%', height: '100%', maxWidth: '1400px', maxHeight: '900px', display: 'flex', flexDirection: 'row', overflow: 'hidden' }}>
133
- {/* Left Side: VNC Stream + Metadata */}
134
- <div style={{ flex: 1, display: 'flex', flexDirection: 'column', padding: '20px 12px', gap: '20px', minWidth: 0 }}>
135
- <VNCStream vncUrl={vncUrl} />
136
- <Metadata trace={trace} />
137
- </div>
138
-
139
- {/* Right Side: Stack Steps */}
140
- <StackSteps trace={trace} />
141
- </div>
142
- </div>
143
- </div>
144
- );
145
- };
146
-
147
- export default Index;
cua2-front/src/pages/Task.tsx CHANGED
@@ -1,8 +1,8 @@
1
- import React, { useEffect } from 'react';
2
- import { useNavigate } from 'react-router-dom';
3
- import { useAgentStore, selectTrace, selectIsAgentProcessing, selectVncUrl, selectMetadata, selectSelectedStep } from '@/stores/agentStore';
4
  import { Header, SandboxViewer, StepsList, Timeline } from '@/components';
 
5
  import { Box } from '@mui/material';
 
 
6
 
7
  const Task = () => {
8
  const navigate = useNavigate();
@@ -25,8 +25,19 @@ const Task = () => {
25
 
26
  // Handler for going back to home
27
  const handleBackToHome = () => {
 
 
 
 
 
 
 
 
 
28
  useAgentStore.getState().resetAgent();
29
- navigate('/');
 
 
30
  };
31
 
32
  // Determine if we should show success/fail status (same logic as SandboxViewer)
@@ -66,15 +77,15 @@ const Task = () => {
66
  overflowX: 'hidden',
67
  }}
68
  >
69
- <Box
70
- sx={{
71
- width: '100%',
72
- display: 'flex',
73
- flexDirection: { xs: 'column', md: 'row' },
74
- p: { xs: 2, md: 4 },
75
- pb: { xs: 2, md: 3 },
76
- }}
77
- >
78
  {/* Left Side: OS Stream + Metadata */}
79
  <Box
80
  sx={{
 
 
 
 
1
  import { Header, SandboxViewer, StepsList, Timeline } from '@/components';
2
+ import { selectIsAgentProcessing, selectMetadata, selectSelectedStep, selectTrace, selectVncUrl, useAgentStore } from '@/stores/agentStore';
3
  import { Box } from '@mui/material';
4
+ import { useEffect } from 'react';
5
+ import { useNavigate } from 'react-router-dom';
6
 
7
  const Task = () => {
8
  const navigate = useNavigate();
 
25
 
26
  // Handler for going back to home
27
  const handleBackToHome = () => {
28
+ const currentTrace = useAgentStore.getState().trace;
29
+
30
+ // Stop the current task if it's running
31
+ const stopTask = (window as Window & { __stopCurrentTask?: () => void }).__stopCurrentTask;
32
+ if (stopTask) {
33
+ stopTask();
34
+ }
35
+
36
+ // Reset frontend state
37
  useAgentStore.getState().resetAgent();
38
+
39
+ // Reload the page to reconnect the WebSocket
40
+ window.location.href = '/';
41
  };
42
 
43
  // Determine if we should show success/fail status (same logic as SandboxViewer)
 
77
  overflowX: 'hidden',
78
  }}
79
  >
80
+ <Box
81
+ sx={{
82
+ width: '100%',
83
+ display: 'flex',
84
+ flexDirection: { xs: 'column', md: 'row' },
85
+ p: { xs: 2, md: 4 },
86
+ pb: { xs: 2, md: 3 },
87
+ }}
88
+ >
89
  {/* Left Side: OS Stream + Metadata */}
90
  <Box
91
  sx={{
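
Both `App.tsx` and `Task.tsx` reach `window.__stopCurrentTask` through an inline cast. A hedged alternative, not part of this commit: declare the global once so call sites stay uncast.

```typescript
// Suggestion sketch only: a single global declaration replaces the repeated
// `(window as Window & { __stopCurrentTask?: () => void })` casts.
declare global {
  interface Window {
    __stopCurrentTask?: () => void;
  }
}

export function stopRunningTask(): void {
  // Optional call: a no-op until App.tsx has registered the handler.
  window.__stopCurrentTask?.();
}
```
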
cua2-front/src/stores/agentStore.ts CHANGED
@@ -5,6 +5,7 @@ import { devtools } from 'zustand/middleware';
5
  interface AgentState {
6
  // State
7
  trace?: AgentTrace;
 
8
  isAgentProcessing: boolean;
9
  isConnectingToE2B: boolean; // New state for E2B connection
10
  vncUrl: string;
@@ -19,6 +20,7 @@ interface AgentState {
19
 
20
  // Actions
21
  setTrace: (trace: AgentTrace | undefined) => void;
 
22
  updateTraceWithStep: (step: AgentStep, metadata: AgentTraceMetadata) => void;
23
  completeTrace: (metadata: AgentTraceMetadata, finalState?: 'success' | 'stopped' | 'max_steps_reached' | 'error' | 'sandbox_timeout') => void;
24
  setIsAgentProcessing: (processing: boolean) => void;
@@ -36,6 +38,7 @@ interface AgentState {
36
 
37
  const initialState = {
38
  trace: undefined,
 
39
  isAgentProcessing: false,
40
  isConnectingToE2B: false,
41
  vncUrl: '',
@@ -58,6 +61,10 @@ export const useAgentStore = create<AgentState>()(
58
  setTrace: (trace) =>
59
  set({ trace }, false, 'setTrace'),
60
 
 
 
 
 
61
  // Update trace with a new step
62
  updateTraceWithStep: (step, metadata) =>
63
  set(
@@ -90,62 +97,62 @@ export const useAgentStore = create<AgentState>()(
90
  'updateTraceWithStep'
91
  ),
92
 
93
- // Complete the trace
94
- completeTrace: (metadata, finalState?: 'success' | 'stopped' | 'max_steps_reached' | 'error' | 'sandbox_timeout') =>
95
- set(
96
- (state) => {
97
- if (!state.trace) return state;
98
-
99
- // Preserve existing maxSteps if new metadata has 0
100
- const updatedMetadata = {
101
- ...metadata,
102
- maxSteps: metadata.maxSteps > 0
103
- ? metadata.maxSteps
104
- : (state.trace.traceMetadata?.maxSteps || 200),
105
- completed: true,
106
- };
107
-
108
- // Determine the final step type based on final_state from backend
109
- let stepType: 'success' | 'failure' | 'stopped' | 'max_steps_reached' | 'sandbox_timeout';
110
- let stepMessage: string | undefined;
111
-
112
- if (finalState === 'stopped') {
113
- stepType = 'stopped';
114
- stepMessage = 'Task stopped by user';
115
- } else if (finalState === 'max_steps_reached') {
116
- stepType = 'max_steps_reached';
117
- stepMessage = 'Maximum steps reached';
118
- } else if (finalState === 'sandbox_timeout') {
119
- stepType = 'sandbox_timeout';
120
- stepMessage = 'Sandbox timeout';
121
- } else if (finalState === 'error' || state.error) {
122
- stepType = 'failure';
123
- stepMessage = state.error || 'Task failed';
124
- } else {
125
- stepType = 'success';
126
- stepMessage = undefined;
127
- }
128
-
129
- const finalStep: FinalStep = {
130
- type: stepType,
131
- message: stepMessage,
132
- metadata: updatedMetadata,
133
- };
134
-
135
- return {
136
- trace: {
137
- ...state.trace,
138
- isRunning: false,
139
- traceMetadata: updatedMetadata,
 
 
 
 
 
140
  },
141
- finalStep,
142
- // Keep error in state for display
143
- selectedStepIndex: null, // Reset to live mode on completion
144
- };
145
- },
146
- false,
147
- 'completeTrace'
148
- ),
149
 
150
  // Set processing state
151
  setIsAgentProcessing: (isAgentProcessing) =>
@@ -227,10 +234,11 @@ export const useAgentStore = create<AgentState>()(
227
  toggleDarkMode: () =>
228
  set((state) => ({ isDarkMode: !state.isDarkMode }), false, 'toggleDarkMode'),
229
 
230
- // Reset agent state
231
  resetAgent: () =>
232
  set((state) => ({
233
  ...initialState,
 
234
  isDarkMode: state.isDarkMode, // Keep dark mode preference
235
  isConnected: state.isConnected, // Keep connection status
236
  selectedModelId: state.selectedModelId, // Keep selected model
@@ -244,6 +252,7 @@ export const useAgentStore = create<AgentState>()(
244
 
245
  // Selectors for better performance
246
  export const selectTrace = (state: AgentState) => state.trace;
 
247
  export const selectIsAgentProcessing = (state: AgentState) => state.isAgentProcessing;
248
  export const selectIsConnectingToE2B = (state: AgentState) => state.isConnectingToE2B;
249
  export const selectVncUrl = (state: AgentState) => state.vncUrl;
 
5
  interface AgentState {
6
  // State
7
  trace?: AgentTrace;
8
+ traceId: string | null; // Set by backend heartbeat, persists during connection
9
  isAgentProcessing: boolean;
10
  isConnectingToE2B: boolean; // New state for E2B connection
11
  vncUrl: string;
 
20
 
21
  // Actions
22
  setTrace: (trace: AgentTrace | undefined) => void;
23
+ setTraceId: (traceId: string | null) => void;
24
  updateTraceWithStep: (step: AgentStep, metadata: AgentTraceMetadata) => void;
25
  completeTrace: (metadata: AgentTraceMetadata, finalState?: 'success' | 'stopped' | 'max_steps_reached' | 'error' | 'sandbox_timeout') => void;
26
  setIsAgentProcessing: (processing: boolean) => void;
 
38
 
39
  const initialState = {
40
  trace: undefined,
41
+ traceId: null, // Will be set by backend heartbeat
42
  isAgentProcessing: false,
43
  isConnectingToE2B: false,
44
  vncUrl: '',
 
61
  setTrace: (trace) =>
62
  set({ trace }, false, 'setTrace'),
63
 
64
+ // Set trace ID (set by backend heartbeat, only cleared on disconnect)
65
+ setTraceId: (traceId) =>
66
+ set({ traceId }, false, 'setTraceId'),
67
+
68
  // Update trace with a new step
69
  updateTraceWithStep: (step, metadata) =>
70
  set(
 
97
  'updateTraceWithStep'
98
  ),
99
 
100
+ // Complete the trace
101
+ completeTrace: (metadata, finalState?: 'success' | 'stopped' | 'max_steps_reached' | 'error' | 'sandbox_timeout') =>
102
+ set(
103
+ (state) => {
104
+ if (!state.trace) return state;
105
+
106
+ // Preserve existing maxSteps if new metadata has 0
107
+ const updatedMetadata = {
108
+ ...metadata,
109
+ maxSteps: metadata.maxSteps > 0
110
+ ? metadata.maxSteps
111
+ : (state.trace.traceMetadata?.maxSteps || 200),
112
+ completed: true,
113
+ };
114
+
115
+ // Determine the final step type based on final_state from backend
116
+ let stepType: 'success' | 'failure' | 'stopped' | 'max_steps_reached' | 'sandbox_timeout';
117
+ let stepMessage: string | undefined;
118
+
119
+ if (finalState === 'stopped') {
120
+ stepType = 'stopped';
121
+ stepMessage = 'Task stopped by user';
122
+ } else if (finalState === 'max_steps_reached') {
123
+ stepType = 'max_steps_reached';
124
+ stepMessage = 'Maximum steps reached';
125
+ } else if (finalState === 'sandbox_timeout') {
126
+ stepType = 'sandbox_timeout';
127
+ stepMessage = 'Sandbox timeout';
128
+ } else if (finalState === 'error' || state.error) {
129
+ stepType = 'failure';
130
+ stepMessage = state.error || 'Task failed';
131
+ } else {
132
+ stepType = 'success';
133
+ stepMessage = undefined;
134
+ }
135
+
136
+ const finalStep: FinalStep = {
137
+ type: stepType,
138
+ message: stepMessage,
139
+ metadata: updatedMetadata,
140
+ };
141
+
142
+ return {
143
+ trace: {
144
+ ...state.trace,
145
+ isRunning: false,
146
+ traceMetadata: updatedMetadata,
147
+ },
148
+ finalStep,
149
+ // Keep error in state for display
150
+ selectedStepIndex: null, // Reset to live mode on completion
151
+ };
152
  },
153
+ false,
154
+ 'completeTrace'
155
+ ),
 
 
 
 
 
156
 
157
  // Set processing state
158
  setIsAgentProcessing: (isAgentProcessing) =>
 
234
  toggleDarkMode: () =>
235
  set((state) => ({ isDarkMode: !state.isDarkMode }), false, 'toggleDarkMode'),
236
 
237
+ // Reset agent state (but preserve traceId from backend during connection)
238
  resetAgent: () =>
239
  set((state) => ({
240
  ...initialState,
241
+ traceId: state.traceId, // IMPORTANT: Keep traceId from backend
242
  isDarkMode: state.isDarkMode, // Keep dark mode preference
243
  isConnected: state.isConnected, // Keep connection status
244
  selectedModelId: state.selectedModelId, // Keep selected model
 
252
 
253
  // Selectors for better performance
254
  export const selectTrace = (state: AgentState) => state.trace;
255
+ export const selectTraceId = (state: AgentState) => state.traceId;
256
  export const selectIsAgentProcessing = (state: AgentState) => state.isAgentProcessing;
257
  export const selectIsConnectingToE2B = (state: AgentState) => state.isConnectingToE2B;
258
  export const selectVncUrl = (state: AgentState) => state.vncUrl;
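
A short usage sketch for the new `traceId` state, assuming the store and `selectTraceId` selector exported above; the hook name is illustrative, not part of the commit.

```typescript
import { selectTraceId, useAgentStore } from '@/stores/agentStore';

// A task may only be sent once the backend heartbeat has assigned an id;
// resetAgent() deliberately preserves it between tasks.
export function useCanSendTask(): boolean {
  return useAgentStore(selectTraceId) !== null;
}
```
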
cua2-front/src/types/agent.ts CHANGED
@@ -80,6 +80,7 @@ interface VncUrlUnsetEvent {
80
 
81
  interface HeartbeatEvent {
82
  type: 'heartbeat';
 
83
  }
84
 
85
  export type WebSocketEvent =
 
80
 
81
  interface HeartbeatEvent {
82
  type: 'heartbeat';
83
+ uuid: string;
84
  }
85
 
86
  export type WebSocketEvent =
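
Because `WebSocketEvent` is a discriminated union on `type`, adding `uuid` here means `event.uuid` only type-checks inside the heartbeat branch. A self-contained sketch; the second event shape is an assumption based on the handlers earlier in this commit, not a copy of `types/agent.ts`.

```typescript
type WebSocketEvent =
  | { type: 'heartbeat'; uuid: string }
  | { type: 'vnc_url_set'; vncUrl: string };

// The switch narrows `event` per case, so `uuid` and `vncUrl` are each
// visible only in their own branch.
function describe(event: WebSocketEvent): string {
  switch (event.type) {
    case 'heartbeat':
      return `heartbeat for trace ${event.uuid}`;
    case 'vnc_url_set':
      return `sandbox stream at ${event.vncUrl}`;
  }
}
```
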