NeerajCodz committed on
Commit
27cde0c
·
1 Parent(s): afefaea

feat: add API routes and utility modules

Browse files
backend/app/api/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """API package initialization."""
2
+
3
+ from app.api.routes import agents, episode, health, memory, tasks, tools
4
+
5
+ __all__ = ["agents", "episode", "health", "memory", "tasks", "tools"]
backend/app/api/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (349 Bytes). View file
 
backend/app/api/__pycache__/deps.cpython-314.pyc ADDED
Binary file (7.7 kB). View file
 
backend/app/api/deps.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Dependency injection utilities for API routes."""
2
+
3
+ from typing import Annotated, Generator
4
+
5
+ from fastapi import Depends, HTTPException, status
6
+
7
+ from app.config import Settings, get_settings
8
+ from app.core.env import WebScraperEnv
9
+ from app.memory.manager import MemoryManager
10
+ from app.models.router import SmartModelRouter
11
+ from app.tools.registry import MCPToolRegistry
12
+
13
+
14
+ # Settings dependency
15
+ SettingsDep = Annotated[Settings, Depends(get_settings)]
16
+
17
+
18
+ # Store for active environments
19
+ _active_environments: dict[str, WebScraperEnv] = {}
20
+
21
+
22
def get_environment(episode_id: str) -> WebScraperEnv:
    """Look up the active environment registered under *episode_id*.

    Raises:
        HTTPException: 404 when no environment exists for the given ID.
    """
    env = _active_environments.get(episode_id)
    if env is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Episode {episode_id} not found",
        )
    return env
30
+
31
+
32
def create_environment(episode_id: str, settings: Settings) -> WebScraperEnv:
    """Register and return a fresh environment for *episode_id*.

    Raises:
        HTTPException: 409 when the episode already has an environment.
    """
    if episode_id in _active_environments:
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail=f"Episode {episode_id} already exists",
        )
    environment = WebScraperEnv(episode_id=episode_id, settings=settings)
    _active_environments[episode_id] = environment
    return environment
42
+
43
+
44
def remove_environment(episode_id: str) -> bool:
    """Drop the environment for *episode_id*; return True if one existed."""
    return _active_environments.pop(episode_id, None) is not None
50
+
51
+
52
def list_environments() -> list[str]:
    """Return the episode IDs of every currently active environment."""
    return [*_active_environments]
55
+
56
+
57
class DependencyContainer:
    """Holds the application-wide singletons used for dependency injection.

    Each component is registered once (at startup) through its ``set_*``
    method; the matching ``get_*`` fails fast with ``RuntimeError`` when
    accessed before registration.
    """

    def __init__(self) -> None:
        # All slots start unset; they are wired up during app startup.
        self._memory_manager: MemoryManager | None = None
        self._model_router: SmartModelRouter | None = None
        self._tool_registry: MCPToolRegistry | None = None

    # --- memory manager -------------------------------------------------

    def set_memory_manager(self, manager: MemoryManager) -> None:
        """Register the memory manager instance."""
        self._memory_manager = manager

    def get_memory_manager(self) -> MemoryManager:
        """Return the memory manager; raise if not yet registered."""
        if self._memory_manager is None:
            raise RuntimeError("Memory manager not initialized")
        return self._memory_manager

    # --- model router ---------------------------------------------------

    def set_model_router(self, router: SmartModelRouter) -> None:
        """Register the model router instance."""
        self._model_router = router

    def get_model_router(self) -> SmartModelRouter:
        """Return the model router; raise if not yet registered."""
        if self._model_router is None:
            raise RuntimeError("Model router not initialized")
        return self._model_router

    # --- tool registry --------------------------------------------------

    def set_tool_registry(self, registry: MCPToolRegistry) -> None:
        """Register the tool registry instance."""
        self._tool_registry = registry

    def get_tool_registry(self) -> MCPToolRegistry:
        """Return the tool registry; raise if not yet registered."""
        if self._tool_registry is None:
            raise RuntimeError("Tool registry not initialized")
        return self._tool_registry
94
+
95
+
96
# Global container instance shared by the FastAPI dependency functions below.
container = DependencyContainer()


def get_memory_manager() -> MemoryManager:
    """Dependency for memory manager. Raises RuntimeError if not yet wired."""
    return container.get_memory_manager()


def get_model_router() -> SmartModelRouter:
    """Dependency for model router. Raises RuntimeError if not yet wired."""
    return container.get_model_router()


def get_tool_registry() -> MCPToolRegistry:
    """Dependency for tool registry. Raises RuntimeError if not yet wired."""
    return container.get_tool_registry()


# Type aliases for dependency injection (use as parameter annotations in routes)
MemoryManagerDep = Annotated[MemoryManager, Depends(get_memory_manager)]
ModelRouterDep = Annotated[SmartModelRouter, Depends(get_model_router)]
ToolRegistryDep = Annotated[MCPToolRegistry, Depends(get_tool_registry)]
backend/app/api/routes/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """API routes package."""
2
+
3
+ from app.api.routes import agents, episode, health, memory, tasks, tools
4
+
5
+ __all__ = ["agents", "episode", "health", "memory", "tasks", "tools"]
backend/app/api/routes/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (348 Bytes). View file
 
backend/app/api/routes/__pycache__/agents.cpython-314.pyc ADDED
Binary file (14.7 kB). View file
 
backend/app/api/routes/__pycache__/episode.cpython-314.pyc ADDED
Binary file (11.2 kB). View file
 
backend/app/api/routes/__pycache__/health.cpython-314.pyc ADDED
Binary file (6.35 kB). View file
 
backend/app/api/routes/__pycache__/memory.cpython-314.pyc ADDED
Binary file (16.5 kB). View file
 
backend/app/api/routes/__pycache__/tasks.cpython-314.pyc ADDED
Binary file (11.3 kB). View file
 
backend/app/api/routes/__pycache__/tools.cpython-314.pyc ADDED
Binary file (13.1 kB). View file
 
backend/app/api/routes/agents.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Agent management endpoints."""
2
+
3
+ import logging
4
+ from enum import Enum
5
+ from typing import Any
6
+ from uuid import uuid4
7
+
8
+ from fastapi import APIRouter, HTTPException, status
9
+ from pydantic import BaseModel, Field
10
+
11
+ router = APIRouter(prefix="/agents")
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class AgentType(str, Enum):
    """Types of agents in the system (values are the wire-format strings)."""

    PLANNER = "planner"
    NAVIGATOR = "navigator"
    EXTRACTOR = "extractor"
    VERIFIER = "verifier"
    MEMORY = "memory"
    COORDINATOR = "coordinator"


class AgentStatus(str, Enum):
    """Agent execution status reported in run responses and agent state."""

    IDLE = "idle"
    PLANNING = "planning"
    EXECUTING = "executing"
    WAITING = "waiting"
    COMPLETED = "completed"
    FAILED = "failed"


class AgentRunRequest(BaseModel):
    """Request to run an agent against an episode's current observation."""

    agent_type: AgentType
    episode_id: str
    task_context: dict[str, Any] = Field(default_factory=dict)
    observation: dict[str, Any] | None = None
    config: dict[str, Any] = Field(default_factory=dict)


class AgentRunResponse(BaseModel):
    """Response from agent execution; on failure, `reasoning` holds the error text."""

    run_id: str
    agent_type: AgentType
    status: AgentStatus
    action: dict[str, Any] | None = None
    reasoning: str | None = None
    confidence: float | None = None
    tokens_used: int = 0
    execution_time_ms: float = 0.0


class PlanRequest(BaseModel):
    """Request for creating a plan from a task description."""

    episode_id: str
    task_description: str
    current_state: dict[str, Any] = Field(default_factory=dict)
    constraints: list[str] = Field(default_factory=list)


class PlanStep(BaseModel):
    """A single step in a plan (step_number is assigned 1-based by the route)."""

    step_number: int
    action_type: str
    description: str
    agent: AgentType
    dependencies: list[int] = Field(default_factory=list)
    estimated_cost: float = 0.0


class PlanResponse(BaseModel):
    """Response containing a generated plan and the planner's confidence."""

    plan_id: str
    episode_id: str
    steps: list[PlanStep]
    total_estimated_steps: int
    reasoning: str
    confidence: float


class AgentState(BaseModel):
    """Current state of an agent as tracked by this service."""

    agent_id: str
    agent_type: AgentType
    status: AgentStatus
    current_task: str | None = None
    messages_processed: int = 0
    actions_taken: int = 0
    last_action: dict[str, Any] | None = None
    memory_snapshot: dict[str, Any] = Field(default_factory=dict)


# Store for agent states.
# NOTE(review): process-local and in-memory only — cleared on restart and not
# shared across workers; confirm this is acceptable for deployment.
_agent_states: dict[str, AgentState] = {}
106
+
107
+
108
@router.post(
    "/run",
    response_model=AgentRunResponse,
    status_code=status.HTTP_200_OK,
    summary="Run an agent",
    description="Execute an agent to produce an action",
)
async def run_agent(request: AgentRunRequest) -> AgentRunResponse:
    """
    Run an agent to produce an action for the current observation.

    Args:
        request: Agent run configuration.

    Returns:
        AgentRunResponse: COMPLETED with the agent's action on success;
        FAILED with the error text in ``reasoning`` on any exception
        (deliberately returned as a 200 body rather than an HTTP error).
    """
    run_id = str(uuid4())
    # Lazy %-style args: the message is only formatted if the record is emitted.
    logger.info(
        "Running %s agent for episode %s", request.agent_type, request.episode_id
    )

    try:
        # Imported lazily — presumably to avoid import cycles / startup cost;
        # TODO(review): confirm against app.agents.coordinator.
        from app.agents.coordinator import AgentCoordinator

        coordinator = AgentCoordinator()
        result = await coordinator.run_agent(
            agent_type=request.agent_type,
            episode_id=request.episode_id,
            observation=request.observation,
            config=request.config,
        )

        return AgentRunResponse(
            run_id=run_id,
            agent_type=request.agent_type,
            status=AgentStatus.COMPLETED,
            action=result.get("action"),
            reasoning=result.get("reasoning"),
            confidence=result.get("confidence"),
            tokens_used=result.get("tokens_used", 0),
            execution_time_ms=result.get("execution_time_ms", 0.0),
        )
    except Exception as e:
        # logger.exception records the traceback (logger.error dropped it).
        logger.exception("Agent execution failed: %s", e)
        return AgentRunResponse(
            run_id=run_id,
            agent_type=request.agent_type,
            status=AgentStatus.FAILED,
            reasoning=str(e),
        )
158
+
159
+
160
@router.post(
    "/plan",
    response_model=PlanResponse,
    status_code=status.HTTP_200_OK,
    summary="Generate a plan",
    description="Use the planner agent to generate an execution plan",
)
async def generate_plan(request: PlanRequest) -> PlanResponse:
    """
    Generate a plan for completing a task.

    Args:
        request: Planning request with task details.

    Returns:
        PlanResponse: Generated plan with 1-based numbered steps.

    Raises:
        HTTPException: 500 when the planner fails for any reason.
    """
    plan_id = str(uuid4())
    logger.info("Generating plan for episode %s", request.episode_id)

    try:
        # Imported lazily — presumably to avoid import cycles; TODO(review): confirm.
        from app.agents.planner import PlannerAgent

        planner = PlannerAgent()
        plan_result = await planner.create_plan(
            task_description=request.task_description,
            current_state=request.current_state,
            constraints=request.constraints,
        )

        # Renumber the planner's steps sequentially starting at 1.
        steps = [
            PlanStep(
                step_number=i + 1,
                action_type=step["action_type"],
                description=step["description"],
                agent=AgentType(step["agent"]),
                dependencies=step.get("dependencies", []),
                estimated_cost=step.get("estimated_cost", 0.0),
            )
            for i, step in enumerate(plan_result["steps"])
        ]

        return PlanResponse(
            plan_id=plan_id,
            episode_id=request.episode_id,
            steps=steps,
            total_estimated_steps=len(steps),
            reasoning=plan_result.get("reasoning", ""),
            confidence=plan_result.get("confidence", 0.8),
        )
    except Exception as e:
        # logger.exception keeps the traceback; `from e` preserves the cause chain.
        logger.exception("Plan generation failed: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to generate plan: {str(e)}",
        ) from e
216
+
217
+
218
@router.get(
    "/state/{agent_id}",
    response_model=AgentState,
    status_code=status.HTTP_200_OK,
    summary="Get agent state",
    description="Get the current state of an agent",
)
async def get_agent_state(agent_id: str) -> AgentState:
    """
    Return the tracked state for *agent_id*.

    Args:
        agent_id: ID of the agent.

    Returns:
        AgentState: Current agent state.

    Raises:
        HTTPException: 404 when the agent is unknown.
    """
    state = _agent_states.get(agent_id)
    if state is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Agent {agent_id} not found",
        )
    return state
241
+
242
+
243
@router.get(
    "/types/",
    status_code=status.HTTP_200_OK,
    summary="Get agent types",
    description="Get all available agent types",
)
async def get_agent_types() -> dict[str, list[dict[str, str]]]:
    """
    Describe every available agent type.

    Returns:
        Dict with an "agents" list of {"type", "description"} entries.
    """
    descriptions = {
        AgentType.PLANNER: "Creates execution plans for tasks",
        AgentType.NAVIGATOR: "Handles page navigation and URL management",
        AgentType.EXTRACTOR: "Extracts data from web pages",
        AgentType.VERIFIER: "Validates extracted data",
        AgentType.MEMORY: "Manages memory operations",
        AgentType.COORDINATOR: "Orchestrates multi-agent collaboration",
    }
    return {
        "agents": [
            {"type": agent.value, "description": text}
            for agent, text in descriptions.items()
        ]
    }
265
+
266
+
267
@router.post(
    "/message",
    status_code=status.HTTP_200_OK,
    summary="Send inter-agent message",
    description="Send a message between agents",
)
async def send_agent_message(
    from_agent: str,
    to_agent: str,
    message_type: str,
    content: dict[str, Any],
) -> dict[str, Any]:
    """
    Relay a message from one agent to another.

    Args:
        from_agent: Source agent ID.
        to_agent: Target agent ID.
        message_type: Type of message.
        content: Message content.

    Returns:
        Acknowledgment of message delivery.
    """
    message_id = str(uuid4())
    logger.info(f"Message {message_id}: {from_agent} -> {to_agent} ({message_type})")

    # In production, this would go through a message broker
    ack: dict[str, Any] = {
        "message_id": message_id,
        "status": "delivered",
        "from": from_agent,
        "to": to_agent,
        "type": message_type,
    }
    return ack
backend/app/api/routes/episode.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Episode management endpoints - reset, step, and state operations."""
2
+
3
+ import logging
4
+ from typing import Any
5
+ from uuid import uuid4
6
+
7
+ from fastapi import APIRouter, HTTPException, status
8
+ from pydantic import BaseModel, Field
9
+
10
+ from app.api.deps import SettingsDep, create_environment, get_environment, remove_environment, list_environments
11
+ from app.core.action import Action, ActionType
12
+ from app.core.observation import Observation
13
+
14
+ router = APIRouter(prefix="/episode")
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class ResetRequest(BaseModel):
    """Request model for resetting (creating) an episode."""

    task_id: str = Field(..., description="ID of the task to execute")
    seed: int | None = Field(default=None, description="Random seed for reproducibility")
    config: dict[str, Any] | None = Field(default=None, description="Episode configuration overrides")


class ResetResponse(BaseModel):
    """Response model for episode reset: new ID plus the initial observation."""

    episode_id: str
    task_id: str
    observation: Observation
    info: dict[str, Any]


class StepRequest(BaseModel):
    """Request model for taking a step in an existing episode."""

    episode_id: str = Field(..., description="ID of the episode")
    action: Action = Field(..., description="Action to execute")


class StepResponse(BaseModel):
    """Response model for step execution (gym-style observation/reward/flags)."""

    observation: Observation
    reward: float
    reward_breakdown: dict[str, float]
    terminated: bool
    truncated: bool
    info: dict[str, Any]


class EpisodeState(BaseModel):
    """Current state of an episode, mirrored from the environment snapshot."""

    episode_id: str
    task_id: str
    step_number: int
    current_url: str | None
    is_terminal: bool
    total_reward: float
    extracted_data: dict[str, Any]


class EpisodeListResponse(BaseModel):
    """Response model for listing episodes."""

    episodes: list[str]
    count: int
70
+
71
+
72
@router.post(
    "/reset",
    response_model=ResetResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Reset/create new episode",
    description="Create a new episode for a given task",
)
async def reset_episode(
    request: ResetRequest,
    settings: SettingsDep,
) -> ResetResponse:
    """
    Reset and initialize a new episode.

    Args:
        request: Reset request containing task_id and optional seed.
        settings: Application settings.

    Returns:
        ResetResponse: New episode ID and initial observation.

    Raises:
        HTTPException: 409 if the (freshly generated) ID collides,
        500 if environment creation or reset fails.
    """
    episode_id = str(uuid4())
    logger.info("Creating new episode %s for task %s", episode_id, request.task_id)

    try:
        env = create_environment(episode_id, settings)
        observation, info = await env.reset(
            task_id=request.task_id,
            seed=request.seed,
            config=request.config,
        )

        return ResetResponse(
            episode_id=episode_id,
            task_id=request.task_id,
            observation=observation,
            info=info,
        )
    except Exception as e:
        # logger.exception keeps the traceback; remove the half-created env
        # so a failed reset does not leak a registered environment.
        logger.exception("Failed to reset episode: %s", e)
        remove_environment(episode_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to create episode: {str(e)}",
        ) from e
117
+
118
+
119
@router.post(
    "/step",
    response_model=StepResponse,
    status_code=status.HTTP_200_OK,
    summary="Execute action step",
    description="Execute an action in the episode and receive observation and reward",
)
async def step_episode(request: StepRequest) -> StepResponse:
    """
    Execute an action step in the episode.

    Args:
        request: Step request containing episode_id and action.

    Returns:
        StepResponse: New observation, reward, and termination status.

    Raises:
        HTTPException: 404 if the episode is unknown, 500 if the step fails.
    """
    logger.info("Step in episode %s: %s", request.episode_id, request.action.action_type)

    env = get_environment(request.episode_id)

    try:
        observation, reward, reward_breakdown, terminated, truncated, info = await env.step(
            request.action
        )

        if terminated or truncated:
            # NOTE(review): only logs — the environment is NOT removed here;
            # callers must DELETE /episode/{id} to free it. Confirm intended.
            logger.info("Episode %s completed", request.episode_id)

        return StepResponse(
            observation=observation,
            reward=reward,
            reward_breakdown=reward_breakdown,
            terminated=terminated,
            truncated=truncated,
            info=info,
        )
    except Exception as e:
        # logger.exception keeps the traceback; `from e` preserves the cause chain.
        logger.exception("Step failed: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Step execution failed: {str(e)}",
        ) from e
163
+
164
+
165
@router.get(
    "/state/{episode_id}",
    response_model=EpisodeState,
    status_code=status.HTTP_200_OK,
    summary="Get episode state",
    description="Get the current state of an episode",
)
async def get_episode_state(episode_id: str) -> EpisodeState:
    """
    Return a snapshot of the episode's current state.

    Args:
        episode_id: ID of the episode.

    Returns:
        EpisodeState: Current episode state.

    Raises:
        HTTPException: 404 when the episode is unknown.
    """
    snapshot = get_environment(episode_id).get_state()

    state_fields = (
        "task_id",
        "step_number",
        "current_url",
        "is_terminal",
        "total_reward",
        "extracted_data",
    )
    return EpisodeState(
        episode_id=episode_id,
        **{name: snapshot[name] for name in state_fields},
    )
194
+
195
+
196
@router.delete(
    "/{episode_id}",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Delete episode",
    description="Clean up and delete an episode",
)
async def delete_episode(episode_id: str) -> None:
    """
    Delete an episode and clean up resources.

    Args:
        episode_id: ID of the episode to delete.

    Raises:
        HTTPException: 404 when no such episode is registered.
    """
    removed = remove_environment(episode_id)
    if not removed:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Episode {episode_id} not found",
        )
    logger.info(f"Deleted episode {episode_id}")
215
+
216
+
217
@router.get(
    "/",
    response_model=EpisodeListResponse,
    status_code=status.HTTP_200_OK,
    summary="List episodes",
    description="List all active episodes",
)
async def list_episodes() -> EpisodeListResponse:
    """
    Enumerate the active episodes.

    Returns:
        EpisodeListResponse: IDs of all active episodes and their count.
    """
    active = list_environments()
    return EpisodeListResponse(episodes=active, count=len(active))
backend/app/api/routes/health.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Health check endpoints."""
2
+
3
+ import logging
4
+ from datetime import datetime, timezone
5
+ from typing import Any
6
+
7
+ from fastapi import APIRouter, status
8
+ from pydantic import BaseModel
9
+
10
+ from app.config import get_settings
11
+
12
+ router = APIRouter()
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class HealthResponse(BaseModel):
    """Health check response model."""

    status: str  # "healthy" (set by health_check)
    timestamp: str  # ISO-8601 UTC timestamp
    version: str  # app version from settings
    uptime_seconds: float | None = None  # None until set_startup_time() is called


class ReadyResponse(BaseModel):
    """Readiness check response model."""

    ready: bool  # True only when every individual check passed
    checks: dict[str, bool]  # per-component pass/fail
    details: dict[str, Any] | None = None  # error texts / extra info, if any


# Track startup time; populated by set_startup_time() at app startup.
_startup_time: datetime | None = None
35
+
36
+
37
def set_startup_time() -> None:
    """Record the process start time (UTC) used by /health to report uptime."""
    global _startup_time
    _startup_time = datetime.now(timezone.utc)
41
+
42
+
43
@router.get(
    "/health",
    response_model=HealthResponse,
    status_code=status.HTTP_200_OK,
    summary="Health check",
    description="Basic health check endpoint",
)
async def health_check() -> HealthResponse:
    """
    Perform a basic health check.

    Returns:
        HealthResponse: Current health status; uptime is None until
        set_startup_time() has been called.
    """
    now = datetime.now(timezone.utc)
    elapsed = (now - _startup_time).total_seconds() if _startup_time else None

    return HealthResponse(
        status="healthy",
        timestamp=now.isoformat(),
        version=get_settings().app_version,
        uptime_seconds=elapsed,
    )
70
+
71
+
72
@router.get(
    "/ready",
    response_model=ReadyResponse,
    status_code=status.HTTP_200_OK,
    summary="Readiness check",
    description="Check if the application is ready to serve requests",
)
async def readiness_check() -> ReadyResponse:
    """
    Perform a readiness check.

    Each core component (memory manager, model router, tool registry) is
    probed independently; failures are recorded rather than raised.

    Returns:
        ReadyResponse: Readiness status with individual check results.
    """
    results: dict[str, bool] = {}
    diagnostics: dict[str, Any] = {}

    # Memory manager
    try:
        from app.main import get_memory_manager

        results["memory_manager"] = get_memory_manager() is not None
    except Exception as e:
        results["memory_manager"] = False
        diagnostics["memory_manager_error"] = str(e)

    # Model router (also reports the providers it can route to)
    try:
        from app.main import get_model_router

        model_router = get_model_router()
        results["model_router"] = model_router is not None
        if model_router:
            diagnostics["available_providers"] = model_router.list_providers()
    except Exception as e:
        results["model_router"] = False
        diagnostics["model_router_error"] = str(e)

    # Tool registry (also reports how many tools are registered)
    try:
        from app.main import get_tool_registry

        tool_registry = get_tool_registry()
        results["tool_registry"] = tool_registry is not None
        if tool_registry:
            diagnostics["registered_tools"] = len(tool_registry.list_tools())
    except Exception as e:
        results["tool_registry"] = False
        diagnostics["tool_registry_error"] = str(e)

    return ReadyResponse(
        ready=all(results.values()),
        checks=results,
        details=diagnostics or None,
    )
132
+
133
+
134
@router.get(
    "/ping",
    status_code=status.HTTP_200_OK,
    summary="Ping endpoint",
    description="Simple ping endpoint for load balancers",
)
async def ping() -> dict[str, str]:
    """Constant-payload liveness probe for load balancers."""
    return {"ping": "pong"}
backend/app/api/routes/memory.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Memory management endpoints."""
2
+
3
+ import logging
4
+ from datetime import datetime, timezone
5
+ from enum import Enum
6
+ from typing import Any
7
+ from uuid import uuid4
8
+
9
+ from fastapi import APIRouter, HTTPException, status
10
+ from pydantic import BaseModel, Field
11
+
12
+ router = APIRouter(prefix="/memory")
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class MemoryType(str, Enum):
    """Types of memory layers (values are the wire-format strings)."""

    SHORT_TERM = "short_term"
    WORKING = "working"
    LONG_TERM = "long_term"
    SHARED = "shared"


class MemoryEntry(BaseModel):
    """A single memory entry as stored and returned by this API."""

    id: str
    memory_type: MemoryType
    content: dict[str, Any]
    metadata: dict[str, Any] = Field(default_factory=dict)
    timestamp: str  # ISO-8601 UTC; refreshed on update
    episode_id: str | None = None
    agent_id: str | None = None
    relevance_score: float | None = None  # populated only on query results
    embedding: list[float] | None = None  # reserved; not set by these routes

    
class MemoryQueryRequest(BaseModel):
    """Request for querying memory; defaults to searching every layer."""

    query: str
    memory_types: list[MemoryType] = Field(default_factory=lambda: list(MemoryType))
    episode_id: str | None = None
    limit: int = 10
    min_relevance: float = 0.0


class MemoryQueryResponse(BaseModel):
    """Response from memory query (entries are limited; total_found is not)."""

    entries: list[MemoryEntry]
    total_found: int
    query: str


class MemoryStoreRequest(BaseModel):
    """Request to store a memory entry; id and timestamp are server-assigned."""

    memory_type: MemoryType
    content: dict[str, Any]
    metadata: dict[str, Any] = Field(default_factory=dict)
    episode_id: str | None = None
    agent_id: str | None = None


class MemoryStats(BaseModel):
    """Statistics about memory usage."""

    short_term_count: int
    working_count: int
    long_term_count: int
    shared_count: int
    total_count: int
    oldest_entry: str | None = None
    newest_entry: str | None = None


# In-memory storage (would use actual memory layers in production).
# NOTE(review): process-local — cleared on restart, not shared across workers.
_memory_store: dict[str, MemoryEntry] = {}
81
+
82
+
83
@router.post(
    "/store",
    response_model=MemoryEntry,
    status_code=status.HTTP_201_CREATED,
    summary="Store memory entry",
    description="Store a new memory entry",
)
async def store_memory(request: MemoryStoreRequest) -> MemoryEntry:
    """
    Persist a new memory entry with a server-assigned ID and timestamp.

    Args:
        request: Memory storage request.

    Returns:
        MemoryEntry: Stored memory entry.
    """
    entry_id = str(uuid4())

    new_entry = MemoryEntry(
        id=entry_id,
        memory_type=request.memory_type,
        content=request.content,
        metadata=request.metadata,
        timestamp=datetime.now(timezone.utc).isoformat(),
        episode_id=request.episode_id,
        agent_id=request.agent_id,
    )
    _memory_store[entry_id] = new_entry

    logger.info(f"Stored memory entry {entry_id} ({request.memory_type})")
    return new_entry
117
+
118
+
119
@router.post(
    "/query",
    response_model=MemoryQueryResponse,
    status_code=status.HTTP_200_OK,
    summary="Query memory",
    description="Query memory entries by semantic similarity or filters",
)
async def query_memory(request: MemoryQueryRequest) -> MemoryQueryResponse:
    """
    Query memory entries.

    Scoring is done on copies so stored entries are never mutated with
    query-specific relevance scores (previously, stale scores leaked into
    the store and later GET /{entry_id} responses).

    Args:
        request: Memory query request.

    Returns:
        MemoryQueryResponse: Matching memory entries with relevance scores.
    """
    logger.info("Querying memory: '%s...'", request.query[:50])

    entries = list(_memory_store.values())

    # Filter by memory type
    if request.memory_types:
        entries = [e for e in entries if e.memory_type in request.memory_types]

    # Filter by episode
    if request.episode_id:
        entries = [e for e in entries if e.episode_id == request.episode_id]

    # Simple text matching (would use embeddings in production)
    query_lower = request.query.lower()
    scored_entries: list[MemoryEntry] = []
    for entry in entries:
        content_str = str(entry.content).lower()
        if query_lower not in content_str:
            continue
        raw_score = content_str.count(query_lower) / len(content_str.split())
        score = min(raw_score * 10, 1.0)
        if score >= request.min_relevance:
            # Score a copy so the stored entry stays pristine.
            scored_entries.append(entry.copy(update={"relevance_score": score}))

    # Sort by relevance and limit
    scored_entries.sort(key=lambda e: e.relevance_score or 0, reverse=True)
    result_entries = scored_entries[: request.limit]

    return MemoryQueryResponse(
        entries=result_entries,
        total_found=len(scored_entries),
        query=request.query,
    )
169
+
170
+
171
@router.get(
    "/{entry_id}",
    response_model=MemoryEntry,
    status_code=status.HTTP_200_OK,
    summary="Get memory entry",
    description="Get a specific memory entry by ID",
)
async def get_memory_entry(entry_id: str) -> MemoryEntry:
    """
    Fetch a single memory entry by its identifier.

    Args:
        entry_id: ID of the memory entry.

    Returns:
        MemoryEntry: The stored entry.
    """
    entry = _memory_store.get(entry_id)
    if entry is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Memory entry {entry_id} not found",
        )
    return entry
194
+
195
+
196
@router.put(
    "/{entry_id}",
    response_model=MemoryEntry,
    status_code=status.HTTP_200_OK,
    summary="Update memory entry",
    description="Update an existing memory entry",
)
async def update_memory_entry(
    entry_id: str,
    content: dict[str, Any],
    metadata: dict[str, Any] | None = None,
) -> MemoryEntry:
    """
    Replace an entry's content and optionally merge new metadata.

    Args:
        entry_id: ID of the entry to update.
        content: New content (replaces the old content entirely).
        metadata: Optional metadata merged into the existing metadata.

    Returns:
        MemoryEntry: The updated entry with a refreshed timestamp.
    """
    stored = _memory_store.get(entry_id)
    if stored is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Memory entry {entry_id} not found",
        )

    stored.content = content
    if metadata:
        # Merge rather than replace so existing metadata keys survive.
        stored.metadata.update(metadata)
    stored.timestamp = datetime.now(timezone.utc).isoformat()

    logger.info(f"Updated memory entry {entry_id}")
    return stored
233
+
234
+
235
@router.delete(
    "/{entry_id}",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Delete memory entry",
    description="Delete a memory entry",
)
async def delete_memory_entry(entry_id: str) -> None:
    """
    Remove a memory entry from the store.

    Args:
        entry_id: ID of the entry to delete.
    """
    # pop() removes and reports absence in one dictionary operation.
    removed = _memory_store.pop(entry_id, None)
    if removed is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Memory entry {entry_id} not found",
        )
    logger.info(f"Deleted memory entry {entry_id}")
256
+
257
+
258
@router.get(
    "/stats/overview",
    response_model=MemoryStats,
    status_code=status.HTTP_200_OK,
    summary="Get memory stats",
    description="Get statistics about memory usage",
)
async def get_memory_stats() -> MemoryStats:
    """
    Summarize the in-memory store: per-layer counts and timestamp range.

    Returns:
        MemoryStats: Memory usage statistics.
    """
    all_entries = list(_memory_store.values())

    # Tally entries per memory layer in a single pass.
    per_layer = dict.fromkeys(MemoryType, 0)
    for item in all_entries:
        per_layer[item.memory_type] += 1

    timestamps = [item.timestamp for item in all_entries]
    oldest = min(timestamps) if timestamps else None
    newest = max(timestamps) if timestamps else None

    return MemoryStats(
        short_term_count=per_layer[MemoryType.SHORT_TERM],
        working_count=per_layer[MemoryType.WORKING],
        long_term_count=per_layer[MemoryType.LONG_TERM],
        shared_count=per_layer[MemoryType.SHARED],
        total_count=len(all_entries),
        oldest_entry=oldest,
        newest_entry=newest,
    )
289
+
290
+
291
@router.delete(
    "/clear/{memory_type}",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Clear memory layer",
    description="Clear all entries from a memory layer",
)
async def clear_memory_layer(memory_type: MemoryType) -> None:
    """
    Clear all entries from a memory layer.

    Args:
        memory_type: Type of memory to clear.
    """
    # NOTE: no `global` statement needed — the dict is mutated in place,
    # never rebound; the original `global _memory_store` was misleading.
    # Snapshot keys first: deleting while iterating the dict would raise.
    to_delete = [k for k, v in _memory_store.items() if v.memory_type == memory_type]
    for key in to_delete:
        del _memory_store[key]
    logger.info(f"Cleared {len(to_delete)} entries from {memory_type}")
309
+
310
+
311
@router.post(
    "/consolidate",
    status_code=status.HTTP_200_OK,
    summary="Consolidate memory",
    description="Consolidate short-term memory into long-term memory",
)
async def consolidate_memory(episode_id: str | None = None) -> dict[str, Any]:
    """
    Promote short-term entries to long-term memory.

    Args:
        episode_id: Optional episode to restrict consolidation to.

    Returns:
        Dict with the consolidated entry count and the episode id.
    """
    # Select short-term entries in scope (optionally one episode only).
    candidates = [
        item
        for item in _memory_store.values()
        if item.memory_type == MemoryType.SHORT_TERM
        and (not episode_id or item.episode_id == episode_id)
    ]

    for item in candidates:
        item.memory_type = MemoryType.LONG_TERM

    logger.info(f"Consolidated {len(candidates)} entries to long-term memory")

    return {
        "consolidated_count": len(candidates),
        "episode_id": episode_id,
    }
344
+ }
backend/app/api/routes/tasks.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tasks management endpoints."""
2
+
3
+ import logging
4
+ from enum import Enum
5
+ from typing import Any
6
+
7
+ from fastapi import APIRouter, HTTPException, status
8
+ from pydantic import BaseModel, Field
9
+
10
+ router = APIRouter(prefix="/tasks")
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class TaskDifficulty(str, Enum):
    """Task difficulty levels.

    Inherits ``str`` so members serialize directly as their string values
    in JSON responses and path/query parameters.
    """

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"
    EXPERT = "expert"
+
22
+
23
class TaskType(str, Enum):
    """Types of scraping tasks.

    Inherits ``str`` so members serialize directly as their string values
    in JSON responses and path/query parameters.
    """

    SINGLE_PAGE = "single_page"
    MULTI_PAGE = "multi_page"
    SEARCH_EXTRACT = "search_extract"
    FORM_FILL = "form_fill"
    DYNAMIC_CONTENT = "dynamic_content"
    AUTHENTICATION = "authentication"
+
33
+
34
class FieldSchema(BaseModel):
    """Schema for a field to extract.

    Describes one datum a task requires the agent to pull from a page.
    """

    name: str  # Machine-readable field identifier.
    description: str  # Human-readable hint about what to extract.
    field_type: str = "string"  # Logical type, e.g. "string", "number", "array".
    required: bool = True  # Whether the extraction must include this field.
    validation_pattern: str | None = None  # Optional regex the value must match.
+
43
+
44
class Task(BaseModel):
    """A scraping task definition.

    Either ``target_url`` (a concrete page) or ``target_domain`` (a site the
    agent must search within) locates the target; both are optional.
    """

    id: str  # Unique task identifier (repository key).
    name: str  # Short display name.
    description: str  # What the agent must accomplish.
    task_type: TaskType
    difficulty: TaskDifficulty
    target_url: str | None = None  # Concrete page to scrape, if known.
    target_domain: str | None = None  # Domain scope when no exact URL is given.
    fields_to_extract: list[FieldSchema]  # Data the agent must return.
    success_criteria: dict[str, Any]  # Free-form pass/fail thresholds.
    hints: list[str] = Field(default_factory=list)  # Optional guidance strings.
    max_steps: int = 50  # Step budget for an episode on this task.
    timeout_seconds: float = 300.0  # Wall-clock budget.
    tags: list[str] = Field(default_factory=list)  # Labels for filtering.
+
61
+
62
class TaskListResponse(BaseModel):
    """Response for listing tasks (one page of results)."""

    tasks: list[Task]  # Tasks on the requested page.
    total: int  # Total matches across all pages (after filtering).
    page: int  # 1-indexed page number echoed back.
    page_size: int  # Requested page size echoed back.
+
70
+
71
class TaskProgress(BaseModel):
    """Progress on a task within an episode."""

    task_id: str  # Task being worked on.
    fields_extracted: int  # Fields extracted so far.
    fields_total: int  # Total fields the task requires.
    steps_taken: int  # Steps consumed in the episode.
    max_steps: int  # Step budget for the task.
    accuracy_estimate: float  # Estimated extraction accuracy (0..1 presumably — confirm with producer).
    completion_percentage: float  # Overall completion estimate.
+
82
+
83
# Sample task repository (would be database-backed in production).
# Keys duplicate each Task.id so lookups by id are O(1).
TASK_REPOSITORY: dict[str, Task] = {
    # Easy: single static page, three fields.
    "task_001": Task(
        id="task_001",
        name="Extract Product Details",
        description="Extract product name, price, and description from an e-commerce page",
        task_type=TaskType.SINGLE_PAGE,
        difficulty=TaskDifficulty.EASY,
        target_url="https://example.com/product/123",
        fields_to_extract=[
            FieldSchema(name="product_name", description="The name of the product"),
            FieldSchema(name="price", description="Current price", field_type="number"),
            FieldSchema(name="description", description="Product description"),
        ],
        success_criteria={"min_accuracy": 0.9, "required_fields": ["product_name", "price"]},
        hints=["Look for h1 tags for product name", "Price often in span with class containing 'price'"],
        tags=["ecommerce", "product"],
    ),
    # Medium: requires a search step before extraction; domain-scoped.
    "task_002": Task(
        id="task_002",
        name="Search and Extract Company Info",
        description="Search for a company and extract key information from search results",
        task_type=TaskType.SEARCH_EXTRACT,
        difficulty=TaskDifficulty.MEDIUM,
        target_domain="linkedin.com",
        fields_to_extract=[
            FieldSchema(name="company_name", description="Official company name"),
            FieldSchema(name="industry", description="Primary industry"),
            FieldSchema(name="employee_count", description="Number of employees", field_type="string"),
            FieldSchema(name="headquarters", description="Location of headquarters"),
        ],
        success_criteria={"min_accuracy": 0.8, "required_fields": ["company_name", "industry"]},
        tags=["search", "company", "linkedin"],
        max_steps=30,
    ),
    # Hard: pagination across many pages; larger step budget.
    "task_003": Task(
        id="task_003",
        name="Multi-page Article Extraction",
        description="Navigate through paginated articles and extract all content",
        task_type=TaskType.MULTI_PAGE,
        difficulty=TaskDifficulty.HARD,
        target_domain="news-site.example.com",
        fields_to_extract=[
            FieldSchema(name="articles", description="List of article data", field_type="array"),
        ],
        success_criteria={"min_articles": 10, "min_accuracy": 0.85},
        tags=["pagination", "articles", "news"],
        max_steps=100,
    ),
}
133
+
134
+
135
@router.get(
    "/",
    response_model=TaskListResponse,
    status_code=status.HTTP_200_OK,
    summary="List available tasks",
    description="Get a paginated list of available scraping tasks",
)
async def list_tasks(
    page: int = 1,
    page_size: int = 20,
    difficulty: TaskDifficulty | None = None,
    task_type: TaskType | None = None,
    tag: str | None = None,
) -> TaskListResponse:
    """
    List available tasks with optional filtering.

    Args:
        page: Page number (1-indexed); values below 1 are clamped to 1.
        page_size: Number of tasks per page; values below 1 are clamped to 1.
        difficulty: Filter by difficulty level.
        task_type: Filter by task type.
        tag: Filter by tag.

    Returns:
        TaskListResponse: Paginated list of tasks.
    """
    # Clamp pagination inputs: page=0 or a negative page would produce a
    # negative slice start and silently return items from the wrong end.
    page = max(page, 1)
    page_size = max(page_size, 1)

    tasks = list(TASK_REPOSITORY.values())

    # Apply filters
    if difficulty:
        tasks = [t for t in tasks if t.difficulty == difficulty]
    if task_type:
        tasks = [t for t in tasks if t.task_type == task_type]
    if tag:
        tasks = [t for t in tasks if tag in t.tags]

    # Paginate
    total = len(tasks)
    start = (page - 1) * page_size
    end = start + page_size
    paginated_tasks = tasks[start:end]

    return TaskListResponse(
        tasks=paginated_tasks,
        total=total,
        page=page,
        page_size=page_size,
    )
+ )
184
+
185
+
186
@router.get(
    "/{task_id}",
    response_model=Task,
    status_code=status.HTTP_200_OK,
    summary="Get task details",
    description="Get details of a specific task",
)
async def get_task(task_id: str) -> Task:
    """
    Look up one task by its identifier.

    Args:
        task_id: ID of the task.

    Returns:
        Task: The stored task definition.
    """
    task = TASK_REPOSITORY.get(task_id)
    if task is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Task {task_id} not found",
        )
    return task
209
+
210
+
211
@router.post(
    "/",
    response_model=Task,
    status_code=status.HTTP_201_CREATED,
    summary="Create a new task",
    description="Create a new scraping task",
)
async def create_task(task: Task) -> Task:
    """
    Register a new task in the repository.

    Args:
        task: Task definition (its id must be unused).

    Returns:
        Task: The task as stored.
    """
    # Duplicate ids are a client error: report conflict, don't overwrite.
    if task.id in TASK_REPOSITORY:
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail=f"Task {task.id} already exists",
        )

    TASK_REPOSITORY[task.id] = task
    logger.info(f"Created task {task.id}: {task.name}")
    return task
236
+
237
+
238
@router.get(
    "/types/",
    status_code=status.HTTP_200_OK,
    summary="Get task types",
    description="Get all available task types",
)
async def get_task_types() -> dict[str, list[str]]:
    """
    Enumerate the supported task types and difficulty levels.

    Returns:
        Dict mapping "task_types" and "difficulties" to their string values.
    """
    type_values = [member.value for member in TaskType]
    difficulty_values = [member.value for member in TaskDifficulty]
    return {"task_types": type_values, "difficulties": difficulty_values}
+ }
backend/app/api/routes/tools.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tool registry and testing endpoints."""
2
+
3
+ import logging
4
+ from typing import Any
5
+
6
+ from fastapi import APIRouter, HTTPException, status
7
+ from pydantic import BaseModel, Field
8
+
9
+ router = APIRouter(prefix="/tools")
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class ToolParameter(BaseModel):
    """Parameter definition for a tool."""

    name: str  # Parameter name as passed in the call payload.
    type: str  # Logical type, e.g. "string", "boolean", "integer".
    description: str  # Human-readable purpose of the parameter.
    required: bool = True  # Whether callers must supply it.
    default: Any | None = None  # Default used when omitted (if not required).
21
+
22
+
23
class ToolDefinition(BaseModel):
    """Definition of a tool in the registry."""

    name: str  # Unique tool name used for lookup and execution.
    description: str  # What the tool does.
    category: str  # Grouping key, e.g. "browser", "extraction", "search".
    parameters: list[ToolParameter]  # Accepted call parameters.
    returns: str  # Human-readable description of the return value.
    examples: list[dict[str, Any]] = Field(default_factory=list)  # Sample calls.
    requires_browser: bool = False  # True if a live browser session is needed.
    cost_estimate: float = 0.0  # Relative cost per invocation (unit unspecified — confirm with registry owner).
34
+
35
+
36
class ToolRegistryResponse(BaseModel):
    """Response containing the tool registry."""

    tools: list[ToolDefinition]  # Tools matching the request (possibly filtered).
    categories: list[str]  # All known categories, regardless of filter.
    total_count: int  # Number of tools in `tools`.
42
+
43
+
44
class ToolTestRequest(BaseModel):
    """Request to test a tool."""

    tool_name: str  # Name of the registered tool to invoke.
    parameters: dict[str, Any] = Field(default_factory=dict)  # Call arguments.
    dry_run: bool = True  # When True, validate only and return a mock result.
50
+
51
+
52
class ToolTestResponse(BaseModel):
    """Response from tool testing."""

    tool_name: str  # Tool that was tested.
    success: bool  # False when validation or execution raised.
    result: Any | None = None  # Tool output (or mock payload for dry runs).
    error: str | None = None  # Stringified exception when success is False.
    execution_time_ms: float = 0.0  # Wall-clock duration of the test.
    dry_run: bool  # Echo of the request's dry_run flag.
61
+
62
+
63
# Tool definitions (would be dynamically registered in production).
# Grouped by category: browser interaction, content extraction, and search.
TOOL_DEFINITIONS: list[ToolDefinition] = [
    # --- Browser navigation / interaction ---
    ToolDefinition(
        name="navigate_to",
        description="Navigate the browser to a specified URL",
        category="browser",
        parameters=[
            ToolParameter(name="url", type="string", description="URL to navigate to"),
            ToolParameter(name="wait_for", type="string", description="CSS selector to wait for", required=False),
        ],
        returns="NavigationResult with page info",
        requires_browser=True,
        cost_estimate=0.01,
    ),
    ToolDefinition(
        name="click_element",
        description="Click on an element identified by selector",
        category="browser",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector of element to click"),
        ],
        returns="ClickResult with success status",
        requires_browser=True,
        cost_estimate=0.005,
    ),
    # --- Content extraction ---
    ToolDefinition(
        name="extract_text",
        description="Extract text content from elements",
        category="extraction",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector to extract from"),
            ToolParameter(name="multiple", type="boolean", description="Extract from all matches", default=False),
        ],
        returns="Extracted text or list of texts",
        requires_browser=True,
        cost_estimate=0.002,
    ),
    ToolDefinition(
        name="extract_attribute",
        description="Extract attribute value from element",
        category="extraction",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector"),
            ToolParameter(name="attribute", type="string", description="Attribute name to extract"),
        ],
        returns="Attribute value",
        requires_browser=True,
        cost_estimate=0.002,
    ),
    # --- Search (no browser required) ---
    ToolDefinition(
        name="search_engine",
        description="Perform a search using a search engine",
        category="search",
        parameters=[
            ToolParameter(name="query", type="string", description="Search query"),
            ToolParameter(name="engine", type="string", description="Search engine", default="google"),
            ToolParameter(name="num_results", type="integer", description="Number of results", default=10),
        ],
        returns="List of search results",
        cost_estimate=0.05,
    ),
    ToolDefinition(
        name="fill_form",
        description="Fill a form field with a value",
        category="browser",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector of form field"),
            ToolParameter(name="value", type="string", description="Value to fill"),
        ],
        returns="FillResult with success status",
        requires_browser=True,
        cost_estimate=0.005,
    ),
    ToolDefinition(
        name="screenshot",
        description="Take a screenshot of the current page",
        category="browser",
        parameters=[
            ToolParameter(name="full_page", type="boolean", description="Capture full page", default=False),
        ],
        returns="Base64 encoded screenshot",
        requires_browser=True,
        cost_estimate=0.01,
    ),
    ToolDefinition(
        name="get_page_html",
        description="Get the full HTML content of the current page",
        category="extraction",
        parameters=[],
        returns="HTML string",
        requires_browser=True,
        cost_estimate=0.001,
    ),
    ToolDefinition(
        name="wait_for_selector",
        description="Wait for an element to appear on the page",
        category="browser",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector to wait for"),
            ToolParameter(name="timeout_ms", type="integer", description="Timeout in milliseconds", default=30000),
        ],
        returns="Boolean indicating if element appeared",
        requires_browser=True,
        cost_estimate=0.001,
    ),
    ToolDefinition(
        name="scroll_to",
        description="Scroll to a position or element",
        category="browser",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector", required=False),
            ToolParameter(name="position", type="string", description="Position: top, bottom, or pixel value", required=False),
        ],
        returns="ScrollResult",
        requires_browser=True,
        cost_estimate=0.001,
    ),
]
181
+
182
+
183
@router.get(
    "/registry",
    response_model=ToolRegistryResponse,
    status_code=status.HTTP_200_OK,
    summary="Get tool registry",
    description="Get all available tools in the registry",
)
async def get_tool_registry(category: str | None = None) -> ToolRegistryResponse:
    """
    Get the tool registry with all available tools.

    Args:
        category: Optional filter by category.

    Returns:
        ToolRegistryResponse: Tools (filtered if a category was given) plus
        the full, sorted category list.
    """
    tools = TOOL_DEFINITIONS
    if category:
        tools = [t for t in tools if t.category == category]

    # Sort for a deterministic order: `list(set(...))` made the response's
    # category order vary between processes/runs.
    categories = sorted({t.category for t in TOOL_DEFINITIONS})

    return ToolRegistryResponse(
        tools=tools,
        categories=categories,
        total_count=len(tools),
    )
211
+
212
+
213
@router.get(
    "/registry/{tool_name}",
    response_model=ToolDefinition,
    status_code=status.HTTP_200_OK,
    summary="Get tool details",
    description="Get details of a specific tool",
)
async def get_tool_details(tool_name: str) -> ToolDefinition:
    """
    Look up a single tool definition by name.

    Args:
        tool_name: Name of the tool.

    Returns:
        ToolDefinition: The matching tool.
    """
    # Linear scan is fine: the registry is a small static list.
    match = next((t for t in TOOL_DEFINITIONS if t.name == tool_name), None)
    if match is not None:
        return match
    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail=f"Tool '{tool_name}' not found",
    )
237
+
238
+
239
@router.post(
    "/test",
    response_model=ToolTestResponse,
    status_code=status.HTTP_200_OK,
    summary="Test a tool",
    description="Test a tool with provided parameters",
)
async def test_tool(request: ToolTestRequest) -> ToolTestResponse:
    """
    Test a tool execution.

    Validates required parameters, then either returns a mock payload
    (dry run) or delegates to the MCP tool registry for real execution.
    Validation/execution failures are reported in the response body
    (success=False); only an unknown tool name yields an HTTP 404.

    Args:
        request: Tool test request.

    Returns:
        ToolTestResponse: Result of tool test.
    """
    import time

    start_time = time.time()
    logger.info(f"Testing tool '{request.tool_name}' with dry_run={request.dry_run}")

    # Find the tool definition by name (linear scan over the static registry).
    tool = None
    for t in TOOL_DEFINITIONS:
        if t.name == request.tool_name:
            tool = t
            break

    if not tool:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Tool '{request.tool_name}' not found",
        )

    try:
        # Validate required parameters before attempting any execution.
        for param in tool.parameters:
            if param.required and param.name not in request.parameters:
                raise ValueError(f"Missing required parameter: {param.name}")

        if request.dry_run:
            # Return mock result for dry run — nothing is actually executed.
            result = {
                "status": "dry_run",
                "tool": request.tool_name,
                "parameters": request.parameters,
                "would_require_browser": tool.requires_browser,
            }
        else:
            # Actually execute the tool (placeholder).
            # Imported lazily so this module loads without the registry package.
            from app.tools.registry import MCPToolRegistry
            registry = MCPToolRegistry()
            result = await registry.execute_tool(request.tool_name, request.parameters)

        execution_time = (time.time() - start_time) * 1000  # milliseconds

        return ToolTestResponse(
            tool_name=request.tool_name,
            success=True,
            result=result,
            execution_time_ms=execution_time,
            dry_run=request.dry_run,
        )
    except Exception as e:
        # Any validation or execution failure becomes a success=False payload
        # so clients always receive a structured ToolTestResponse.
        execution_time = (time.time() - start_time) * 1000
        logger.error(f"Tool test failed: {e}")
        return ToolTestResponse(
            tool_name=request.tool_name,
            success=False,
            error=str(e),
            execution_time_ms=execution_time,
            dry_run=request.dry_run,
        )
313
+
314
+
315
@router.get(
    "/categories",
    status_code=status.HTTP_200_OK,
    summary="Get tool categories",
    description="Get all tool categories",
)
async def get_categories() -> dict[str, list[str]]:
    """
    Get all tool categories mapped to the tool names they contain.

    Returns:
        Dict with a "categories" key mapping category -> list of tool names.
    """
    categories: dict[str, list[str]] = {}
    for tool in TOOL_DEFINITIONS:
        # setdefault replaces the manual "key missing" branch.
        categories.setdefault(tool.category, []).append(tool.name)
    return {"categories": categories}
backend/app/utils/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utility modules for ScrapeRL backend."""
2
+
3
+ from app.utils.html import (
4
+ parse_html,
5
+ clean_html,
6
+ extract_text,
7
+ semantic_chunk,
8
+ extract_links,
9
+ extract_tables,
10
+ )
11
+ from app.utils.logging import setup_logging, get_logger
12
+
13
+ __all__ = [
14
+ "parse_html",
15
+ "clean_html",
16
+ "extract_text",
17
+ "semantic_chunk",
18
+ "extract_links",
19
+ "extract_tables",
20
+ "setup_logging",
21
+ "get_logger",
22
+ ]
backend/app/utils/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (492 Bytes). View file
 
backend/app/utils/__pycache__/html.cpython-314.pyc ADDED
Binary file (10.7 kB). View file
 
backend/app/utils/__pycache__/logging.cpython-314.pyc ADDED
Binary file (2.54 kB). View file
 
backend/app/utils/html.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HTML processing utilities for ScrapeRL backend."""
2
+
3
+ import re
4
+ from typing import Any, Optional
5
+ from bs4 import BeautifulSoup, Tag, NavigableString
6
+
7
+ from app.utils.logging import get_logger
8
+
9
+ logger = get_logger(__name__)
10
+
11
+
12
def parse_html(html: str, parser: str = "html.parser") -> BeautifulSoup:
    """
    Parse HTML string into a BeautifulSoup object.

    Thin wrapper so every utility in this module parses the same way.

    Args:
        html: Raw HTML string
        parser: Parser to use (html.parser, lxml, html5lib) — lxml/html5lib
            require those packages to be installed; verify before using.

    Returns:
        Parsed BeautifulSoup object
    """
    return BeautifulSoup(html, parser)
24
+
25
+
26
def clean_html(
    html: str,
    remove_scripts: bool = True,
    remove_styles: bool = True,
    remove_comments: bool = True,
    remove_tags: Optional[list[str]] = None,
) -> str:
    """
    Strip unwanted elements out of an HTML document.

    Args:
        html: Raw HTML string
        remove_scripts: Drop <script> elements
        remove_styles: Drop <style> elements
        remove_comments: Drop HTML comments
        remove_tags: Extra tag names to drop entirely

    Returns:
        The cleaned HTML serialized back to a string
    """
    soup = parse_html(html)

    # Collect tag names to strip, then remove them with a shared loop.
    doomed_tags: list[str] = []
    if remove_scripts:
        doomed_tags.append("script")
    if remove_styles:
        doomed_tags.append("style")
    for tag_name in doomed_tags:
        for node in soup.find_all(tag_name):
            node.decompose()

    if remove_comments:
        from bs4 import Comment

        for node in soup.find_all(string=lambda text: isinstance(text, Comment)):
            node.extract()

    # Remove any caller-specified tags as well.
    for tag_name in remove_tags or []:
        for node in soup.find_all(tag_name):
            node.decompose()

    return str(soup)
72
+
73
+
74
def extract_text(
    html: str,
    separator: str = " ",
    strip: bool = True,
) -> str:
    """
    Strip markup from HTML and return only its text content.

    Args:
        html: Raw HTML string
        separator: Joiner placed between text segments
        strip: Collapse whitespace runs and trim the ends

    Returns:
        Extracted plain text
    """
    soup = parse_html(html)

    # Drop elements whose contents are never visible text.
    for hidden in soup(["script", "style", "noscript"]):
        hidden.decompose()

    text = soup.get_text(separator=separator)
    if not strip:
        return text

    # Collapse all whitespace runs into single spaces and trim.
    return re.sub(r"\s+", " ", text).strip()
103
+
104
+
105
def semantic_chunk(
    html: str,
    max_chunk_size: int = 4000,
    overlap: int = 200,
) -> list[dict[str, Any]]:
    """
    Split HTML content into semantic chunks based on structure.

    Walks structural elements (articles, sections, paragraphs, headings) in
    document order and accumulates their text into chunks of at most
    ``max_chunk_size`` characters, carrying ``overlap`` trailing characters
    into the next chunk for continuity. Falls back to fixed-size slicing of
    the plain text when no structural elements are found.

    Args:
        html: Raw HTML string
        max_chunk_size: Maximum characters per chunk
        overlap: Number of characters to overlap between chunks

    Returns:
        List of chunk dictionaries with text and metadata
        ("tags": element names contributing to the chunk,
        "headings": heading texts seen, "char_count": chunk length)
    """
    soup = parse_html(html)
    chunks: list[dict[str, Any]] = []

    # Remove non-content elements
    for element in soup(["script", "style", "noscript", "nav", "footer", "header"]):
        element.decompose()

    # Find semantic boundaries
    # NOTE(review): nested matches (e.g. a <p> inside a matched <div>) are
    # visited separately by find_all, so text may repeat across elements —
    # confirm this duplication is intended.
    semantic_tags = ["article", "section", "div", "p", "h1", "h2", "h3", "h4", "h5", "h6"]

    def get_text_content(element: Tag | NavigableString) -> str:
        # Normalize either node kind to its visible text.
        if isinstance(element, NavigableString):
            return str(element).strip()
        return element.get_text(separator=" ", strip=True)

    current_chunk = ""
    current_metadata: dict[str, Any] = {"tags": [], "headings": []}

    for element in soup.find_all(semantic_tags):
        text = get_text_content(element)
        if not text:
            continue

        tag_name = element.name if isinstance(element, Tag) else "text"

        # Check if adding this would exceed max size
        if len(current_chunk) + len(text) + 1 > max_chunk_size:
            if current_chunk:
                chunks.append({
                    "text": current_chunk.strip(),
                    "metadata": current_metadata.copy(),
                    "char_count": len(current_chunk),
                })
            # Start new chunk with overlap (tail of the previous chunk).
            if overlap > 0 and current_chunk:
                current_chunk = current_chunk[-overlap:] + " " + text
            else:
                current_chunk = text
            current_metadata = {"tags": [tag_name], "headings": []}
        else:
            current_chunk += " " + text if current_chunk else text
            current_metadata["tags"].append(tag_name)

        # Track headings (truncated to 100 chars) for chunk context.
        if tag_name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            current_metadata["headings"].append(text[:100])

    # Add remaining content
    if current_chunk.strip():
        chunks.append({
            "text": current_chunk.strip(),
            "metadata": current_metadata,
            "char_count": len(current_chunk),
        })

    # If no semantic chunks found, fall back to simple fixed-size chunking
    # over the flattened plain text.
    if not chunks:
        text = extract_text(html)
        for i in range(0, len(text), max_chunk_size - overlap):
            chunk_text = text[i : i + max_chunk_size]
            if chunk_text.strip():
                chunks.append({
                    "text": chunk_text.strip(),
                    "metadata": {"tags": [], "headings": []},
                    "char_count": len(chunk_text),
                })

    return chunks
189
+
190
+
191
def extract_links(
    html: str,
    base_url: Optional[str] = None,
    include_text: bool = True,
) -> list[dict[str, str]]:
    """
    Collect hyperlinks from an HTML document.

    Args:
        html: Raw HTML string
        base_url: If given, relative hrefs are resolved against it
        include_text: Whether to include each anchor's visible text

    Returns:
        One dict per link with "href" and optionally "text"/"title"
    """
    from urllib.parse import urljoin

    soup = parse_html(html)
    results: list[dict[str, str]] = []

    for anchor in soup.find_all("a", href=True):
        href = anchor.get("href", "")
        # Skip empty hrefs, in-page fragments, and javascript: pseudo-links.
        if not href or href.startswith("#") or href.startswith("javascript:"):
            continue

        # Resolve relative URLs against the base, leaving absolute ones alone.
        is_absolute = href.startswith(("http://", "https://", "//"))
        if base_url and not is_absolute:
            href = urljoin(base_url, href)

        item: dict[str, str] = {"href": href}

        if include_text:
            item["text"] = anchor.get_text(strip=True)

        # Include title if present
        title = anchor.get("title")
        if title:
            item["title"] = title

        results.append(item)

    return results
234
+
235
+
236
def extract_tables(
    html: str,
    include_headers: bool = True,
) -> list[dict[str, Any]]:
    """
    Extract tables from HTML as structured data.

    Headers are taken from <thead> when present; otherwise the first body
    row containing <th> cells is promoted to the header list. Tables with
    neither rows nor headers are skipped.

    Args:
        html: Raw HTML string
        include_headers: Try to identify and include header rows

    Returns:
        List of table dictionaries with "headers" (list of strings) and
        "rows" (list of lists of cell strings)
    """
    soup = parse_html(html)
    tables: list[dict[str, Any]] = []

    for table in soup.find_all("table"):
        table_data: dict[str, Any] = {
            "headers": [],
            "rows": [],
        }

        # Extract headers from thead or first row
        if include_headers:
            thead = table.find("thead")
            if thead:
                header_row = thead.find("tr")
                if header_row:
                    # Accept td cells too: real-world theads often misuse td.
                    table_data["headers"] = [
                        th.get_text(strip=True)
                        for th in header_row.find_all(["th", "td"])
                    ]

        # Extract body rows; fall back to the table itself when no <tbody>.
        tbody = table.find("tbody") or table
        for row in tbody.find_all("tr"):
            cells = row.find_all(["td", "th"])
            row_data = [cell.get_text(strip=True) for cell in cells]

            # If no headers yet and this looks like a header row
            if include_headers and not table_data["headers"] and row.find("th"):
                table_data["headers"] = row_data
            else:
                if row_data:  # Skip empty rows
                    table_data["rows"].append(row_data)

        # Keep only tables that produced some content.
        if table_data["rows"] or table_data["headers"]:
            tables.append(table_data)

    return tables
backend/app/utils/logging.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Logging utilities for ScrapeRL backend."""
2
+
3
+ import logging
4
+ import sys
5
+ from typing import Optional
6
+
7
+
8
def setup_logging(
    level: str = "INFO",
    format_string: Optional[str] = None,
    log_file: Optional[str] = None,
) -> None:
    """
    Configure logging for the application.

    Args:
        level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL);
            unknown names fall back to INFO.
        format_string: Custom format string for log messages
        log_file: Optional file path to write logs to (UTF-8)
    """
    if format_string is None:
        format_string = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

    # Unknown level names silently fall back to INFO rather than raising.
    log_level = getattr(logging, level.upper(), logging.INFO)

    handlers: list[logging.Handler] = [logging.StreamHandler(sys.stdout)]

    if log_file:
        # Pin the encoding: without it FileHandler uses the platform default,
        # which mangles non-ASCII log messages on some systems.
        file_handler = logging.FileHandler(log_file, encoding="utf-8")
        handlers.append(file_handler)

    # NOTE: basicConfig is a no-op if the root logger already has handlers.
    logging.basicConfig(
        level=log_level,
        format=format_string,
        handlers=handlers,
    )

    # Reduce noise from chatty third-party HTTP libraries.
    logging.getLogger("httpx").setLevel(logging.WARNING)
    logging.getLogger("httpcore").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
42
+
43
+
44
def get_logger(name: str) -> logging.Logger:
    """
    Return the logger registered under *name*.

    Thin convenience wrapper around ``logging.getLogger`` so application
    modules import one logging entry point from this package.

    Args:
        name: Logger name, typically __name__ of the calling module

    Returns:
        Configured logger instance
    """
    logger_instance = logging.getLogger(name)
    return logger_instance