Spaces:
Running
Running
Commit ·
27cde0c
1
Parent(s): afefaea
feat: add API routes and utility modules
Browse files- backend/app/api/__init__.py +5 -0
- backend/app/api/__pycache__/__init__.cpython-314.pyc +0 -0
- backend/app/api/__pycache__/deps.cpython-314.pyc +0 -0
- backend/app/api/deps.py +118 -0
- backend/app/api/routes/__init__.py +5 -0
- backend/app/api/routes/__pycache__/__init__.cpython-314.pyc +0 -0
- backend/app/api/routes/__pycache__/agents.cpython-314.pyc +0 -0
- backend/app/api/routes/__pycache__/episode.cpython-314.pyc +0 -0
- backend/app/api/routes/__pycache__/health.cpython-314.pyc +0 -0
- backend/app/api/routes/__pycache__/memory.cpython-314.pyc +0 -0
- backend/app/api/routes/__pycache__/tasks.cpython-314.pyc +0 -0
- backend/app/api/routes/__pycache__/tools.cpython-314.pyc +0 -0
- backend/app/api/routes/agents.py +301 -0
- backend/app/api/routes/episode.py +235 -0
- backend/app/api/routes/health.py +142 -0
- backend/app/api/routes/memory.py +344 -0
- backend/app/api/routes/tasks.py +254 -0
- backend/app/api/routes/tools.py +333 -0
- backend/app/utils/__init__.py +22 -0
- backend/app/utils/__pycache__/__init__.cpython-314.pyc +0 -0
- backend/app/utils/__pycache__/html.cpython-314.pyc +0 -0
- backend/app/utils/__pycache__/logging.cpython-314.pyc +0 -0
- backend/app/utils/html.py +286 -0
- backend/app/utils/logging.py +54 -0
backend/app/api/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""API package initialization."""
|
| 2 |
+
|
| 3 |
+
from app.api.routes import agents, episode, health, memory, tasks, tools
|
| 4 |
+
|
| 5 |
+
__all__ = ["agents", "episode", "health", "memory", "tasks", "tools"]
|
backend/app/api/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (349 Bytes). View file
|
|
|
backend/app/api/__pycache__/deps.cpython-314.pyc
ADDED
|
Binary file (7.7 kB). View file
|
|
|
backend/app/api/deps.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Dependency injection utilities for API routes."""
|
| 2 |
+
|
| 3 |
+
from typing import Annotated, Generator
|
| 4 |
+
|
| 5 |
+
from fastapi import Depends, HTTPException, status
|
| 6 |
+
|
| 7 |
+
from app.config import Settings, get_settings
|
| 8 |
+
from app.core.env import WebScraperEnv
|
| 9 |
+
from app.memory.manager import MemoryManager
|
| 10 |
+
from app.models.router import SmartModelRouter
|
| 11 |
+
from app.tools.registry import MCPToolRegistry
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# Settings dependency
|
| 15 |
+
SettingsDep = Annotated[Settings, Depends(get_settings)]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# Store for active environments
|
| 19 |
+
_active_environments: dict[str, WebScraperEnv] = {}
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def get_environment(episode_id: str) -> WebScraperEnv:
    """Return the live environment registered under *episode_id*.

    Raises:
        HTTPException: 404 if no environment is registered for this ID.
    """
    env = _active_environments.get(episode_id)
    if env is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Episode {episode_id} not found",
        )
    return env
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def create_environment(episode_id: str, settings: Settings) -> WebScraperEnv:
    """Build, register, and return a fresh environment for *episode_id*.

    Raises:
        HTTPException: 409 if an environment already exists for this ID.
    """
    if episode_id in _active_environments:
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail=f"Episode {episode_id} already exists",
        )
    environment = WebScraperEnv(episode_id=episode_id, settings=settings)
    _active_environments[episode_id] = environment
    return environment
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def remove_environment(episode_id: str) -> bool:
    """Drop *episode_id* from the active registry.

    Returns:
        bool: True if an environment was removed, False if none was registered.
    """
    # Registry values are always WebScraperEnv instances, never None, so the
    # pop-with-default sentinel test is an exact membership check.
    return _active_environments.pop(episode_id, None) is not None
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def list_environments() -> list[str]:
    """Return the IDs of every currently active episode (insertion order)."""
    return [episode_id for episode_id in _active_environments]
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class DependencyContainer:
    """Registry of application-wide singleton services.

    Services are installed once (typically at startup) through the ``set_*``
    methods and fetched through the matching ``get_*`` accessors, which raise
    ``RuntimeError`` when the requested service was never installed.
    """

    def __init__(self) -> None:
        self._memory_manager: MemoryManager | None = None
        self._model_router: SmartModelRouter | None = None
        self._tool_registry: MCPToolRegistry | None = None

    def set_memory_manager(self, manager: MemoryManager) -> None:
        """Install the memory manager instance."""
        self._memory_manager = manager

    def set_model_router(self, router: SmartModelRouter) -> None:
        """Install the model router instance."""
        self._model_router = router

    def set_tool_registry(self, registry: MCPToolRegistry) -> None:
        """Install the tool registry instance."""
        self._tool_registry = registry

    def get_memory_manager(self) -> MemoryManager:
        """Return the installed memory manager.

        Raises:
            RuntimeError: If no memory manager has been installed.
        """
        manager = self._memory_manager
        if manager is None:
            raise RuntimeError("Memory manager not initialized")
        return manager

    def get_model_router(self) -> SmartModelRouter:
        """Return the installed model router.

        Raises:
            RuntimeError: If no model router has been installed.
        """
        model_router = self._model_router
        if model_router is None:
            raise RuntimeError("Model router not initialized")
        return model_router

    def get_tool_registry(self) -> MCPToolRegistry:
        """Return the installed tool registry.

        Raises:
            RuntimeError: If no tool registry has been installed.
        """
        registry = self._tool_registry
        if registry is None:
            raise RuntimeError("Tool registry not initialized")
        return registry
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# Global container instance
|
| 97 |
+
container = DependencyContainer()
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def get_memory_manager() -> MemoryManager:
    """FastAPI dependency: return the globally registered memory manager.

    Raises:
        RuntimeError: If the manager was never registered on the container.
    """
    return container.get_memory_manager()
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def get_model_router() -> SmartModelRouter:
    """FastAPI dependency: return the globally registered model router.

    Raises:
        RuntimeError: If the router was never registered on the container.
    """
    return container.get_model_router()
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def get_tool_registry() -> MCPToolRegistry:
    """FastAPI dependency: return the globally registered tool registry.

    Raises:
        RuntimeError: If the registry was never registered on the container.
    """
    return container.get_tool_registry()
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# Type aliases for dependency injection
|
| 116 |
+
MemoryManagerDep = Annotated[MemoryManager, Depends(get_memory_manager)]
|
| 117 |
+
ModelRouterDep = Annotated[SmartModelRouter, Depends(get_model_router)]
|
| 118 |
+
ToolRegistryDep = Annotated[MCPToolRegistry, Depends(get_tool_registry)]
|
backend/app/api/routes/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""API routes package."""
|
| 2 |
+
|
| 3 |
+
from app.api.routes import agents, episode, health, memory, tasks, tools
|
| 4 |
+
|
| 5 |
+
__all__ = ["agents", "episode", "health", "memory", "tasks", "tools"]
|
backend/app/api/routes/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (348 Bytes). View file
|
|
|
backend/app/api/routes/__pycache__/agents.cpython-314.pyc
ADDED
|
Binary file (14.7 kB). View file
|
|
|
backend/app/api/routes/__pycache__/episode.cpython-314.pyc
ADDED
|
Binary file (11.2 kB). View file
|
|
|
backend/app/api/routes/__pycache__/health.cpython-314.pyc
ADDED
|
Binary file (6.35 kB). View file
|
|
|
backend/app/api/routes/__pycache__/memory.cpython-314.pyc
ADDED
|
Binary file (16.5 kB). View file
|
|
|
backend/app/api/routes/__pycache__/tasks.cpython-314.pyc
ADDED
|
Binary file (11.3 kB). View file
|
|
|
backend/app/api/routes/__pycache__/tools.cpython-314.pyc
ADDED
|
Binary file (13.1 kB). View file
|
|
|
backend/app/api/routes/agents.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Agent management endpoints."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from enum import Enum
|
| 5 |
+
from typing import Any
|
| 6 |
+
from uuid import uuid4
|
| 7 |
+
|
| 8 |
+
from fastapi import APIRouter, HTTPException, status
|
| 9 |
+
from pydantic import BaseModel, Field
|
| 10 |
+
|
| 11 |
+
router = APIRouter(prefix="/agents")
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class AgentType(str, Enum):
    """Types of agents in the system.

    Inherits from ``str`` so members serialize directly in JSON responses and
    compare equal to their plain string values.
    """

    PLANNER = "planner"          # creates execution plans for tasks
    NAVIGATOR = "navigator"      # handles page navigation and URL management
    EXTRACTOR = "extractor"      # extracts data from web pages
    VERIFIER = "verifier"        # validates extracted data
    MEMORY = "memory"            # manages memory operations
    COORDINATOR = "coordinator"  # orchestrates multi-agent collaboration
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class AgentStatus(str, Enum):
    """Agent execution status reported by the agent endpoints."""

    IDLE = "idle"
    PLANNING = "planning"
    EXECUTING = "executing"
    WAITING = "waiting"
    COMPLETED = "completed"  # run finished; action/reasoning are populated
    FAILED = "failed"        # run raised; error text is returned in `reasoning`
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class AgentRunRequest(BaseModel):
    """Request to run an agent."""

    agent_type: AgentType  # which agent to execute
    episode_id: str        # episode the agent acts within
    # NOTE(review): task_context is accepted but not forwarded to the
    # coordinator by run_agent — confirm whether that is intended.
    task_context: dict[str, Any] = Field(default_factory=dict)
    observation: dict[str, Any] | None = None  # latest environment observation, if any
    config: dict[str, Any] = Field(default_factory=dict)  # per-run agent overrides
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class AgentRunResponse(BaseModel):
    """Response from agent execution."""

    run_id: str                # unique ID minted per /run call
    agent_type: AgentType
    status: AgentStatus        # COMPLETED or FAILED as set by run_agent
    action: dict[str, Any] | None = None  # proposed action; None on failure
    reasoning: str | None = None          # agent rationale, or error text on failure
    confidence: float | None = None
    tokens_used: int = 0
    execution_time_ms: float = 0.0
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class PlanRequest(BaseModel):
    """Request for creating a plan."""

    episode_id: str        # episode the plan applies to
    task_description: str  # free-text description of the task to plan for
    current_state: dict[str, Any] = Field(default_factory=dict)  # planner's view of world state
    constraints: list[str] = Field(default_factory=list)         # constraints the plan must respect
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class PlanStep(BaseModel):
    """A single step in a plan."""

    step_number: int   # 1-based position, assigned in generate_plan
    action_type: str
    description: str
    agent: AgentType   # agent responsible for executing this step
    # presumably references other steps' step_number values — TODO confirm
    dependencies: list[int] = Field(default_factory=list)
    estimated_cost: float = 0.0
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class PlanResponse(BaseModel):
    """Response containing a generated plan."""

    plan_id: str                # unique ID minted per /plan call
    episode_id: str
    steps: list[PlanStep]
    total_estimated_steps: int  # equals len(steps) as produced by generate_plan
    reasoning: str
    confidence: float           # planner confidence; generate_plan defaults it to 0.8
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class AgentState(BaseModel):
    """Current state of an agent, as tracked in the in-process registry."""

    agent_id: str
    agent_type: AgentType
    status: AgentStatus
    current_task: str | None = None
    messages_processed: int = 0
    actions_taken: int = 0
    last_action: dict[str, Any] | None = None
    # point-in-time view of agent memory — schema not defined here; TODO confirm
    memory_snapshot: dict[str, Any] = Field(default_factory=dict)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# Store for agent states
|
| 105 |
+
_agent_states: dict[str, AgentState] = {}
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
@router.post(
    "/run",
    response_model=AgentRunResponse,
    status_code=status.HTTP_200_OK,
    summary="Run an agent",
    description="Execute an agent to produce an action",
)
async def run_agent(request: AgentRunRequest) -> AgentRunResponse:
    """
    Run an agent to produce an action for the current observation.

    Args:
        request: Agent run configuration.

    Returns:
        AgentRunResponse: Result of agent execution. On failure the response
        carries status=FAILED with the error text in ``reasoning`` instead of
        propagating the exception to the client.
    """
    run_id = str(uuid4())
    # Lazy %-args: the message is only formatted when INFO logging is enabled.
    logger.info(
        "Running %s agent for episode %s", request.agent_type, request.episode_id
    )

    try:
        # Local import — presumably avoids an import cycle at module load; confirm.
        from app.agents.coordinator import AgentCoordinator

        coordinator = AgentCoordinator()
        result = await coordinator.run_agent(
            agent_type=request.agent_type,
            episode_id=request.episode_id,
            observation=request.observation,
            config=request.config,
        )

        return AgentRunResponse(
            run_id=run_id,
            agent_type=request.agent_type,
            status=AgentStatus.COMPLETED,
            action=result.get("action"),
            reasoning=result.get("reasoning"),
            confidence=result.get("confidence"),
            tokens_used=result.get("tokens_used", 0),
            execution_time_ms=result.get("execution_time_ms", 0.0),
        )
    except Exception as e:
        # logger.exception preserves the traceback for post-mortem debugging.
        logger.exception("Agent execution failed: %s", e)
        return AgentRunResponse(
            run_id=run_id,
            agent_type=request.agent_type,
            status=AgentStatus.FAILED,
            reasoning=str(e),
        )
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
@router.post(
    "/plan",
    response_model=PlanResponse,
    status_code=status.HTTP_200_OK,
    summary="Generate a plan",
    description="Use the planner agent to generate an execution plan",
)
async def generate_plan(request: PlanRequest) -> PlanResponse:
    """
    Generate a plan for completing a task.

    Args:
        request: Planning request with task details.

    Returns:
        PlanResponse: Generated plan with steps.

    Raises:
        HTTPException: 500 when the planner fails for any reason.
    """
    plan_id = str(uuid4())
    # Lazy %-args: the message is only formatted when INFO logging is enabled.
    logger.info("Generating plan for episode %s", request.episode_id)

    try:
        # Local import — presumably avoids an import cycle at module load; confirm.
        from app.agents.planner import PlannerAgent

        planner = PlannerAgent()
        plan_result = await planner.create_plan(
            task_description=request.task_description,
            current_state=request.current_state,
            constraints=request.constraints,
        )

        # Steps are numbered 1-based in the order the planner produced them.
        steps = [
            PlanStep(
                step_number=i + 1,
                action_type=step["action_type"],
                description=step["description"],
                agent=AgentType(step["agent"]),
                dependencies=step.get("dependencies", []),
                estimated_cost=step.get("estimated_cost", 0.0),
            )
            for i, step in enumerate(plan_result["steps"])
        ]

        return PlanResponse(
            plan_id=plan_id,
            episode_id=request.episode_id,
            steps=steps,
            total_estimated_steps=len(steps),
            reasoning=plan_result.get("reasoning", ""),
            confidence=plan_result.get("confidence", 0.8),
        )
    except Exception as e:
        # logger.exception keeps the traceback; `from e` preserves the cause
        # chain on the re-raised HTTP error (B904).
        logger.exception("Plan generation failed: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to generate plan: {str(e)}",
        ) from e
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
@router.get(
    "/state/{agent_id}",
    response_model=AgentState,
    status_code=status.HTTP_200_OK,
    summary="Get agent state",
    description="Get the current state of an agent",
)
async def get_agent_state(agent_id: str) -> AgentState:
    """
    Return the tracked state for *agent_id*.

    Args:
        agent_id: ID of the agent.

    Returns:
        AgentState: Current agent state.

    Raises:
        HTTPException: 404 if the agent is not in the registry.
    """
    state = _agent_states.get(agent_id)
    if state is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Agent {agent_id} not found",
        )
    return state
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
@router.get(
    "/types/",
    status_code=status.HTTP_200_OK,
    summary="Get agent types",
    description="Get all available agent types",
)
async def get_agent_types() -> dict[str, list[dict[str, str]]]:
    """
    Describe every agent type the system exposes.

    Returns:
        Dict with a single "agents" key mapping to a list of
        {"type", "description"} entries, one per AgentType member.
    """
    descriptions = {
        AgentType.PLANNER: "Creates execution plans for tasks",
        AgentType.NAVIGATOR: "Handles page navigation and URL management",
        AgentType.EXTRACTOR: "Extracts data from web pages",
        AgentType.VERIFIER: "Validates extracted data",
        AgentType.MEMORY: "Manages memory operations",
        AgentType.COORDINATOR: "Orchestrates multi-agent collaboration",
    }
    return {
        "agents": [
            {"type": agent.value, "description": text}
            for agent, text in descriptions.items()
        ]
    }
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
@router.post(
    "/message",
    status_code=status.HTTP_200_OK,
    summary="Send inter-agent message",
    description="Send a message between agents",
)
async def send_agent_message(
    from_agent: str,
    to_agent: str,
    message_type: str,
    content: dict[str, Any],
) -> dict[str, Any]:
    """
    Send a message between agents.

    Args:
        from_agent: Source agent ID.
        to_agent: Target agent ID.
        message_type: Type of message.
        content: Message content.

    Returns:
        Acknowledgment of message delivery.
    """
    message_id = str(uuid4())
    logger.info(f"Message {message_id}: {from_agent} -> {to_agent} ({message_type})")

    # Delivery is only logged here; in production this would go through a
    # message broker.
    acknowledgment = {
        "message_id": message_id,
        "status": "delivered",
        "from": from_agent,
        "to": to_agent,
        "type": message_type,
    }
    return acknowledgment
|
backend/app/api/routes/episode.py
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Episode management endpoints - reset, step, and state operations."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from typing import Any
|
| 5 |
+
from uuid import uuid4
|
| 6 |
+
|
| 7 |
+
from fastapi import APIRouter, HTTPException, status
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
|
| 10 |
+
from app.api.deps import SettingsDep, create_environment, get_environment, remove_environment, list_environments
|
| 11 |
+
from app.core.action import Action, ActionType
|
| 12 |
+
from app.core.observation import Observation
|
| 13 |
+
|
| 14 |
+
router = APIRouter(prefix="/episode")
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ResetRequest(BaseModel):
    """Request model for resetting (creating) an episode.

    Note: the /reset endpoint always mints a brand-new episode ID; the
    request identifies only the task to run.
    """

    task_id: str = Field(..., description="ID of the task to execute")
    seed: int | None = Field(default=None, description="Random seed for reproducibility")
    config: dict[str, Any] | None = Field(default=None, description="Episode configuration overrides")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class ResetResponse(BaseModel):
    """Response model for episode reset."""

    episode_id: str           # newly minted episode ID
    task_id: str              # echoes the requested task
    observation: Observation  # initial observation from env.reset
    info: dict[str, Any]      # auxiliary reset info from the environment
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class StepRequest(BaseModel):
    """Request model for taking a single step in an active episode."""

    episode_id: str = Field(..., description="ID of the episode")
    action: Action = Field(..., description="Action to execute")
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class StepResponse(BaseModel):
    """Response model for step execution."""

    observation: Observation
    reward: float                       # scalar reward for this step
    reward_breakdown: dict[str, float]  # per-component contributions to `reward`
    terminated: bool                    # episode ended — exact semantics set by env.step; confirm
    truncated: bool                     # episode cut short (e.g. step limit) — TODO confirm
    info: dict[str, Any]
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class EpisodeState(BaseModel):
    """Current state of an episode, mirroring env.get_state()."""

    episode_id: str
    task_id: str
    step_number: int         # steps taken so far
    current_url: str | None  # page the scraper is on, if any
    is_terminal: bool
    total_reward: float      # cumulative reward over the episode
    extracted_data: dict[str, Any]
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class EpisodeListResponse(BaseModel):
    """Response model for listing episodes."""

    episodes: list[str]  # active episode IDs
    count: int           # equals len(episodes)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
@router.post(
    "/reset",
    response_model=ResetResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Reset/create new episode",
    description="Create a new episode for a given task",
)
async def reset_episode(
    request: ResetRequest,
    settings: SettingsDep,
) -> ResetResponse:
    """
    Reset and initialize a new episode.

    Args:
        request: Reset request containing task_id and optional seed.
        settings: Application settings.

    Returns:
        ResetResponse: New episode ID and initial observation.

    Raises:
        HTTPException: 500 when environment creation or reset fails; any
            partially-registered environment is cleaned up first. HTTP errors
            raised by helpers pass through with their original status.
    """
    episode_id = str(uuid4())
    # Lazy %-args: the message is only formatted when INFO logging is enabled.
    logger.info("Creating new episode %s for task %s", episode_id, request.task_id)

    try:
        env = create_environment(episode_id, settings)
        observation, info = await env.reset(
            task_id=request.task_id,
            seed=request.seed,
            config=request.config,
        )

        return ResetResponse(
            episode_id=episode_id,
            task_id=request.task_id,
            observation=observation,
            info=info,
        )
    except HTTPException:
        # Don't rewrap a deliberate HTTP error (e.g. create_environment's 409)
        # as a generic 500 — previously it was swallowed by the broad handler.
        raise
    except Exception as e:
        # logger.exception keeps the traceback; `from e` preserves the cause
        # chain on the re-raised HTTP error (B904).
        logger.exception("Failed to reset episode: %s", e)
        # Don't leak a half-initialized environment in the registry.
        remove_environment(episode_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to create episode: {str(e)}",
        ) from e
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
@router.post(
    "/step",
    response_model=StepResponse,
    status_code=status.HTTP_200_OK,
    summary="Execute action step",
    description="Execute an action in the episode and receive observation and reward",
)
async def step_episode(request: StepRequest) -> StepResponse:
    """
    Execute an action step in the episode.

    Args:
        request: Step request containing episode_id and action.

    Returns:
        StepResponse: New observation, reward, and termination status.

    Raises:
        HTTPException: 404 if the episode is unknown (via get_environment),
            500 if the environment step itself fails.
    """
    # Lazy %-args: the message is only formatted when INFO logging is enabled.
    logger.info("Step in episode %s: %s", request.episode_id, request.action.action_type)

    env = get_environment(request.episode_id)

    try:
        observation, reward, reward_breakdown, terminated, truncated, info = await env.step(
            request.action
        )

        # The environment stays registered here; deletion is the client's
        # responsibility via DELETE /episode/{episode_id}.
        if terminated or truncated:
            logger.info("Episode %s completed", request.episode_id)

        return StepResponse(
            observation=observation,
            reward=reward,
            reward_breakdown=reward_breakdown,
            terminated=terminated,
            truncated=truncated,
            info=info,
        )
    except Exception as e:
        # logger.exception keeps the traceback; `from e` preserves the cause
        # chain on the re-raised HTTP error (B904).
        logger.exception("Step failed: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Step execution failed: {str(e)}",
        ) from e
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
@router.get(
    "/state/{episode_id}",
    response_model=EpisodeState,
    status_code=status.HTTP_200_OK,
    summary="Get episode state",
    description="Get the current state of an episode",
)
async def get_episode_state(episode_id: str) -> EpisodeState:
    """
    Snapshot the current state of an episode.

    Args:
        episode_id: ID of the episode.

    Returns:
        EpisodeState: Current episode state.

    Raises:
        HTTPException: 404 if the episode is unknown (via get_environment).
    """
    snapshot = get_environment(episode_id).get_state()

    return EpisodeState(
        episode_id=episode_id,
        task_id=snapshot["task_id"],
        step_number=snapshot["step_number"],
        current_url=snapshot["current_url"],
        is_terminal=snapshot["is_terminal"],
        total_reward=snapshot["total_reward"],
        extracted_data=snapshot["extracted_data"],
    )
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
@router.delete(
    "/{episode_id}",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Delete episode",
    description="Clean up and delete an episode",
)
async def delete_episode(episode_id: str) -> None:
    """
    Delete an episode and clean up resources.

    Args:
        episode_id: ID of the episode to delete.

    Raises:
        HTTPException: 404 if no such episode is registered.
    """
    removed = remove_environment(episode_id)
    if not removed:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Episode {episode_id} not found",
        )
    logger.info(f"Deleted episode {episode_id}")
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
@router.get(
    "/",
    response_model=EpisodeListResponse,
    status_code=status.HTTP_200_OK,
    summary="List episodes",
    description="List all active episodes",
)
async def list_episodes() -> EpisodeListResponse:
    """
    List all active episodes.

    Returns:
        EpisodeListResponse: List of active episode IDs.
    """
    active = list_environments()
    return EpisodeListResponse(episodes=active, count=len(active))
|
backend/app/api/routes/health.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Health check endpoints."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from datetime import datetime, timezone
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from fastapi import APIRouter, status
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
|
| 10 |
+
from app.config import get_settings
|
| 11 |
+
|
| 12 |
+
router = APIRouter()
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class HealthResponse(BaseModel):
    """Health check response model."""

    status: str    # always "healthy" when the endpoint answers at all
    timestamp: str  # ISO-8601 UTC timestamp of the check
    version: str    # application version from settings
    uptime_seconds: float | None = None  # None until set_startup_time() has run
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class ReadyResponse(BaseModel):
    """Readiness check response model returned by ``GET /ready``."""

    # True only when every individual check below passed.
    ready: bool
    # Per-component pass/fail map (memory_manager, model_router, tool_registry).
    checks: dict[str, bool]
    # Extra info (provider lists, tool counts, error strings); omitted when empty.
    details: dict[str, Any] | None = None
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# Track startup time; populated by set_startup_time() and read by
# health_check() to compute uptime. Stays None until explicitly set.
_startup_time: datetime | None = None
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def set_startup_time() -> None:
    """Record the current UTC time as the process startup time.

    Health checks use this to report uptime; until it is called,
    uptime is reported as ``None``.
    """
    global _startup_time
    _startup_time = datetime.now(tz=timezone.utc)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@router.get(
    "/health",
    response_model=HealthResponse,
    status_code=status.HTTP_200_OK,
    summary="Health check",
    description="Basic health check endpoint",
)
async def health_check() -> HealthResponse:
    """Report basic liveness information.

    Returns:
        HealthResponse: status, current timestamp, app version, and
        uptime in seconds (``None`` if startup time was never recorded).
    """
    config = get_settings()
    current_time = datetime.now(timezone.utc)

    # Uptime is only meaningful once set_startup_time() has run.
    elapsed = (current_time - _startup_time).total_seconds() if _startup_time else None

    return HealthResponse(
        status="healthy",
        timestamp=current_time.isoformat(),
        version=config.app_version,
        uptime_seconds=elapsed,
    )
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
@router.get(
    "/ready",
    response_model=ReadyResponse,
    status_code=status.HTTP_200_OK,
    summary="Readiness check",
    description="Check if the application is ready to serve requests",
)
async def readiness_check() -> ReadyResponse:
    """Check whether all core components are available.

    Checks the memory manager, model router, and tool registry. Each
    check is best-effort: an import or lookup failure marks that
    component as not ready and records the error string in ``details``.

    Returns:
        ReadyResponse: overall readiness plus per-component results.
    """
    checks: dict[str, bool] = {}
    details: dict[str, Any] = {}

    # Memory manager availability.
    try:
        from app.main import get_memory_manager

        checks["memory_manager"] = get_memory_manager() is not None
    except Exception as exc:
        checks["memory_manager"] = False
        details["memory_manager_error"] = str(exc)

    # Model router availability (also reports which providers are configured).
    try:
        from app.main import get_model_router

        model_router = get_model_router()
        checks["model_router"] = model_router is not None
        if model_router:
            details["available_providers"] = model_router.list_providers()
    except Exception as exc:
        checks["model_router"] = False
        details["model_router_error"] = str(exc)

    # Tool registry availability (also reports how many tools are registered).
    try:
        from app.main import get_tool_registry

        tool_registry = get_tool_registry()
        checks["tool_registry"] = tool_registry is not None
        if tool_registry:
            details["registered_tools"] = len(tool_registry.list_tools())
    except Exception as exc:
        checks["tool_registry"] = False
        details["tool_registry_error"] = str(exc)

    return ReadyResponse(
        ready=all(checks.values()),
        checks=checks,
        details=details if details else None,
    )
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
@router.get(
    "/ping",
    status_code=status.HTTP_200_OK,
    summary="Ping endpoint",
    description="Simple ping endpoint for load balancers",
)
async def ping() -> dict[str, str]:
    """Trivial liveness probe for load balancers."""
    return dict(ping="pong")
|
backend/app/api/routes/memory.py
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Memory management endpoints."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from datetime import datetime, timezone
|
| 5 |
+
from enum import Enum
|
| 6 |
+
from typing import Any
|
| 7 |
+
from uuid import uuid4
|
| 8 |
+
|
| 9 |
+
from fastapi import APIRouter, HTTPException, status
|
| 10 |
+
from pydantic import BaseModel, Field
|
| 11 |
+
|
| 12 |
+
router = APIRouter(prefix="/memory")
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class MemoryType(str, Enum):
    """Memory layers an entry can live in.

    ``str``-valued so members serialize directly in API paths and JSON
    payloads. The ``/consolidate`` endpoint promotes SHORT_TERM entries
    to LONG_TERM.
    """

    SHORT_TERM = "short_term"
    WORKING = "working"
    LONG_TERM = "long_term"
    SHARED = "shared"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class MemoryEntry(BaseModel):
    """A single memory entry stored in the in-memory store."""

    # Unique identifier; a UUID4 string assigned by the store endpoint.
    id: str
    # Which memory layer this entry belongs to.
    memory_type: MemoryType
    # Arbitrary payload of the memory.
    content: dict[str, Any]
    # Free-form metadata attached to the entry.
    metadata: dict[str, Any] = Field(default_factory=dict)
    # ISO-8601 timestamp of creation (updated in place on PUT).
    timestamp: str
    # Episode this entry is associated with, if any.
    episode_id: str | None = None
    # Agent that produced the entry, if any.
    agent_id: str | None = None
    # Match score filled in by the query endpoint; None until queried.
    relevance_score: float | None = None
    # Vector embedding — never populated by the code visible here;
    # TODO(review): confirm what is expected to produce it.
    embedding: list[float] | None = None
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class MemoryQueryRequest(BaseModel):
    """Request for querying memory."""

    # Free-text query; matched by substring against str(entry.content).
    query: str
    # Layers to search; defaults to every layer.
    memory_types: list[MemoryType] = Field(default_factory=lambda: list(MemoryType))
    # Restrict results to one episode when set.
    episode_id: str | None = None
    # Maximum number of entries to return.
    limit: int = 10
    # Entries scoring below this relevance are dropped.
    min_relevance: float = 0.0
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class MemoryQueryResponse(BaseModel):
    """Response from memory query."""

    # Matching entries, best relevance first, truncated to the request limit.
    entries: list[MemoryEntry]
    # Total matches before the limit was applied.
    total_found: int
    # Echo of the original query string.
    query: str
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class MemoryStoreRequest(BaseModel):
    """Request to store a memory entry; id and timestamp are server-assigned."""

    # Target memory layer.
    memory_type: MemoryType
    # Arbitrary payload to store.
    content: dict[str, Any]
    # Optional free-form metadata.
    metadata: dict[str, Any] = Field(default_factory=dict)
    # Optional episode association.
    episode_id: str | None = None
    # Optional producing-agent association.
    agent_id: str | None = None
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class MemoryStats(BaseModel):
    """Statistics about memory usage, as returned by ``GET /stats/overview``."""

    # Per-layer entry counts.
    short_term_count: int
    working_count: int
    long_term_count: int
    shared_count: int
    # Count across all layers.
    total_count: int
    # Min/max ISO-8601 timestamps across all entries; None when the store is empty.
    oldest_entry: str | None = None
    newest_entry: str | None = None
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# In-memory storage (would use actual memory layers in production).
# Keyed by entry id; shared module-level state mutated by the endpoints below.
_memory_store: dict[str, MemoryEntry] = {}
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
@router.post(
    "/store",
    response_model=MemoryEntry,
    status_code=status.HTTP_201_CREATED,
    summary="Store memory entry",
    description="Store a new memory entry",
)
async def store_memory(request: MemoryStoreRequest) -> MemoryEntry:
    """Create a new memory entry and persist it in the in-memory store.

    Args:
        request: Memory storage request; id and timestamp are assigned here.

    Returns:
        MemoryEntry: The stored entry, including its generated id.
    """
    created_at = datetime.now(timezone.utc).isoformat()

    new_entry = MemoryEntry(
        id=str(uuid4()),
        memory_type=request.memory_type,
        content=request.content,
        metadata=request.metadata,
        timestamp=created_at,
        episode_id=request.episode_id,
        agent_id=request.agent_id,
    )

    _memory_store[new_entry.id] = new_entry
    logger.info(f"Stored memory entry {new_entry.id} ({request.memory_type})")

    return new_entry
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
@router.post(
    "/query",
    response_model=MemoryQueryResponse,
    status_code=status.HTTP_200_OK,
    summary="Query memory",
    description="Query memory entries by semantic similarity or filters",
)
async def query_memory(request: MemoryQueryRequest) -> MemoryQueryResponse:
    """Search stored entries by substring match with a crude relevance score.

    Args:
        request: Query text plus layer/episode filters and limits.

    Returns:
        MemoryQueryResponse: Matching entries, best score first.
    """
    logger.info(f"Querying memory: '{request.query[:50]}...'")

    # Apply layer and episode filters; an empty memory_types list means "all".
    candidates = [
        e
        for e in _memory_store.values()
        if (not request.memory_types or e.memory_type in request.memory_types)
        and (not request.episode_id or e.episode_id == request.episode_id)
    ]

    # Simple text matching (would use embeddings in production).
    needle = request.query.lower()
    matches: list[MemoryEntry] = []
    for candidate in candidates:
        haystack = str(candidate.content).lower()
        if needle not in haystack:
            continue
        occurrences = haystack.count(needle) / len(haystack.split())
        # NOTE(review): this writes relevance_score onto the stored entry
        # itself, so scores persist across queries — confirm intended.
        candidate.relevance_score = min(occurrences * 10, 1.0)
        if candidate.relevance_score >= request.min_relevance:
            matches.append(candidate)

    # Best score first; stable sort preserves store order among ties.
    matches.sort(key=lambda e: e.relevance_score or 0, reverse=True)

    return MemoryQueryResponse(
        entries=matches[: request.limit],
        total_found=len(matches),
        query=request.query,
    )
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
@router.get(
    "/{entry_id}",
    response_model=MemoryEntry,
    status_code=status.HTTP_200_OK,
    summary="Get memory entry",
    description="Get a specific memory entry by ID",
)
async def get_memory_entry(entry_id: str) -> MemoryEntry:
    """Fetch one memory entry by its id.

    Args:
        entry_id: ID of the memory entry.

    Returns:
        MemoryEntry: The memory entry.

    Raises:
        HTTPException: 404 when the id is unknown.
    """
    entry = _memory_store.get(entry_id)
    if entry is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Memory entry {entry_id} not found",
        )
    return entry
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
@router.put(
    "/{entry_id}",
    response_model=MemoryEntry,
    status_code=status.HTTP_200_OK,
    summary="Update memory entry",
    description="Update an existing memory entry",
)
async def update_memory_entry(
    entry_id: str,
    content: dict[str, Any],
    metadata: dict[str, Any] | None = None,
) -> MemoryEntry:
    """Replace an entry's content, merge metadata, and refresh its timestamp.

    Args:
        entry_id: ID of the entry to update.
        content: New content (replaces the old content entirely).
        metadata: Optional metadata to merge into the existing metadata.

    Returns:
        MemoryEntry: The updated entry.

    Raises:
        HTTPException: 404 when the id is unknown.
    """
    target = _memory_store.get(entry_id)
    if target is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Memory entry {entry_id} not found",
        )

    target.content = content
    # Merge rather than replace; a falsy (empty/None) metadata is a no-op.
    if metadata:
        target.metadata |= metadata
    target.timestamp = datetime.now(timezone.utc).isoformat()

    logger.info(f"Updated memory entry {entry_id}")
    return target
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
@router.delete(
    "/{entry_id}",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Delete memory entry",
    description="Delete a memory entry",
)
async def delete_memory_entry(entry_id: str) -> None:
    """Remove one memory entry from the store.

    Args:
        entry_id: ID of the entry to delete.

    Raises:
        HTTPException: 404 when the id is unknown.
    """
    # pop() removes and reports absence in a single lookup.
    if _memory_store.pop(entry_id, None) is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Memory entry {entry_id} not found",
        )
    logger.info(f"Deleted memory entry {entry_id}")
|
| 256 |
+
|
| 257 |
+
|
| 258 |
+
@router.get(
    "/stats/overview",
    response_model=MemoryStats,
    status_code=status.HTTP_200_OK,
    summary="Get memory stats",
    description="Get statistics about memory usage",
)
async def get_memory_stats() -> MemoryStats:
    """Summarize the store: per-layer counts and timestamp extremes.

    Returns:
        MemoryStats: Memory usage statistics.
    """
    snapshot = list(_memory_store.values())

    per_type = {
        layer: sum(1 for e in snapshot if e.memory_type == layer)
        for layer in MemoryType
    }
    stamps = [e.timestamp for e in snapshot]

    return MemoryStats(
        short_term_count=per_type[MemoryType.SHORT_TERM],
        working_count=per_type[MemoryType.WORKING],
        long_term_count=per_type[MemoryType.LONG_TERM],
        shared_count=per_type[MemoryType.SHARED],
        total_count=len(snapshot),
        # ISO-8601 strings sort chronologically, so min/max give oldest/newest.
        oldest_entry=min(stamps, default=None),
        newest_entry=max(stamps, default=None),
    )
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
@router.delete(
    "/clear/{memory_type}",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Clear memory layer",
    description="Clear all entries from a memory layer",
)
async def clear_memory_layer(memory_type: MemoryType) -> None:
    """Delete every entry belonging to one memory layer.

    Args:
        memory_type: Type of memory to clear.
    """
    # The store dict is mutated in place (never rebound), so the original
    # ``global _memory_store`` declaration was unnecessary and misleading.
    # Collect keys first so we don't delete while iterating the dict.
    to_delete = [key for key, entry in _memory_store.items() if entry.memory_type == memory_type]
    for key in to_delete:
        del _memory_store[key]
    logger.info(f"Cleared {len(to_delete)} entries from {memory_type}")
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
@router.post(
    "/consolidate",
    status_code=status.HTTP_200_OK,
    summary="Consolidate memory",
    description="Consolidate short-term memory into long-term memory",
)
async def consolidate_memory(episode_id: str | None = None) -> dict[str, Any]:
    """Promote short-term entries to long-term, optionally within one episode.

    Args:
        episode_id: Optional episode to consolidate; a falsy value means all.

    Returns:
        Dict with the number of promoted entries and the episode filter used.
    """
    promoted = [
        entry
        for entry in _memory_store.values()
        if (not episode_id or entry.episode_id == episode_id)
        and entry.memory_type == MemoryType.SHORT_TERM
    ]

    # Entries are mutated in place; they keep their ids and timestamps.
    for entry in promoted:
        entry.memory_type = MemoryType.LONG_TERM

    logger.info(f"Consolidated {len(promoted)} entries to long-term memory")

    return {
        "consolidated_count": len(promoted),
        "episode_id": episode_id,
    }
|
backend/app/api/routes/tasks.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tasks management endpoints."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from enum import Enum
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from fastapi import APIRouter, HTTPException, status
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
|
| 10 |
+
router = APIRouter(prefix="/tasks")
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class TaskDifficulty(str, Enum):
    """Task difficulty levels.

    ``str``-valued so members serialize directly as query parameters
    and JSON payload values.
    """

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"
    EXPERT = "expert"
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class TaskType(str, Enum):
    """Types of scraping tasks supported by the task repository."""

    SINGLE_PAGE = "single_page"
    MULTI_PAGE = "multi_page"
    SEARCH_EXTRACT = "search_extract"
    FORM_FILL = "form_fill"
    DYNAMIC_CONTENT = "dynamic_content"
    AUTHENTICATION = "authentication"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class FieldSchema(BaseModel):
    """Schema for a single field an agent must extract."""

    # Field identifier used in extraction results.
    name: str
    # Human-readable description of what the field contains.
    description: str
    # Expected value type, e.g. "string", "number", "array".
    field_type: str = "string"
    # Whether the field must be extracted for the task to succeed.
    required: bool = True
    # Optional regex the extracted value should match — not enforced by
    # any code visible here; TODO(review): confirm where validation happens.
    validation_pattern: str | None = None
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class Task(BaseModel):
    """A scraping task definition stored in the task repository."""

    # Unique task identifier (also the repository key).
    id: str
    # Short human-readable title.
    name: str
    # Full description of what the task asks for.
    description: str
    task_type: TaskType
    difficulty: TaskDifficulty
    # Either a concrete URL or a domain may be given, depending on task type.
    target_url: str | None = None
    target_domain: str | None = None
    # Fields the agent is expected to extract.
    fields_to_extract: list[FieldSchema]
    # Free-form criteria, e.g. {"min_accuracy": 0.9, "required_fields": [...]}.
    success_criteria: dict[str, Any]
    # Optional hints shown to the agent.
    hints: list[str] = Field(default_factory=list)
    # Step budget for the episode.
    max_steps: int = 50
    # Wall-clock budget in seconds.
    timeout_seconds: float = 300.0
    # Tags used for filtering in the list endpoint.
    tags: list[str] = Field(default_factory=list)
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class TaskListResponse(BaseModel):
    """Paginated response for listing tasks."""

    # Tasks on the current page.
    tasks: list[Task]
    # Total matching tasks before pagination.
    total: int
    # 1-indexed page number echoed from the request.
    page: int
    page_size: int
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class TaskProgress(BaseModel):
    """Progress on a task within an episode.

    NOTE(review): not referenced by any endpoint visible in this module —
    presumably filled in by episode routes; confirm before removing.
    """

    task_id: str
    # Extraction progress: fields done vs. fields expected.
    fields_extracted: int
    fields_total: int
    # Step budget usage.
    steps_taken: int
    max_steps: int
    # Estimated extraction accuracy in [0, 1] — assumed range; TODO confirm.
    accuracy_estimate: float
    completion_percentage: float
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# Sample task repository (would be database-backed in production).
# Keyed by task id; mutated by the create endpoint at runtime.
TASK_REPOSITORY: dict[str, Task] = {
    # Easy single-page extraction sample.
    "task_001": Task(
        id="task_001",
        name="Extract Product Details",
        description="Extract product name, price, and description from an e-commerce page",
        task_type=TaskType.SINGLE_PAGE,
        difficulty=TaskDifficulty.EASY,
        target_url="https://example.com/product/123",
        fields_to_extract=[
            FieldSchema(name="product_name", description="The name of the product"),
            FieldSchema(name="price", description="Current price", field_type="number"),
            FieldSchema(name="description", description="Product description"),
        ],
        success_criteria={"min_accuracy": 0.9, "required_fields": ["product_name", "price"]},
        hints=["Look for h1 tags for product name", "Price often in span with class containing 'price'"],
        tags=["ecommerce", "product"],
    ),
    # Medium search-and-extract sample.
    "task_002": Task(
        id="task_002",
        name="Search and Extract Company Info",
        description="Search for a company and extract key information from search results",
        task_type=TaskType.SEARCH_EXTRACT,
        difficulty=TaskDifficulty.MEDIUM,
        target_domain="linkedin.com",
        fields_to_extract=[
            FieldSchema(name="company_name", description="Official company name"),
            FieldSchema(name="industry", description="Primary industry"),
            FieldSchema(name="employee_count", description="Number of employees", field_type="string"),
            FieldSchema(name="headquarters", description="Location of headquarters"),
        ],
        success_criteria={"min_accuracy": 0.8, "required_fields": ["company_name", "industry"]},
        tags=["search", "company", "linkedin"],
        max_steps=30,
    ),
    # Hard multi-page (pagination) sample.
    "task_003": Task(
        id="task_003",
        name="Multi-page Article Extraction",
        description="Navigate through paginated articles and extract all content",
        task_type=TaskType.MULTI_PAGE,
        difficulty=TaskDifficulty.HARD,
        target_domain="news-site.example.com",
        fields_to_extract=[
            FieldSchema(name="articles", description="List of article data", field_type="array"),
        ],
        success_criteria={"min_articles": 10, "min_accuracy": 0.85},
        tags=["pagination", "articles", "news"],
        max_steps=100,
    ),
}
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
@router.get(
    "/",
    response_model=TaskListResponse,
    status_code=status.HTTP_200_OK,
    summary="List available tasks",
    description="Get a paginated list of available scraping tasks",
)
async def list_tasks(
    page: int = 1,
    page_size: int = 20,
    difficulty: TaskDifficulty | None = None,
    task_type: TaskType | None = None,
    tag: str | None = None,
) -> TaskListResponse:
    """List available tasks with optional filtering and pagination.

    Args:
        page: Page number (1-indexed); values below 1 are clamped to 1.
        page_size: Number of tasks per page; values below 1 are clamped to 1.
        difficulty: Filter by difficulty level.
        task_type: Filter by task type.
        tag: Filter by tag.

    Returns:
        TaskListResponse: Paginated list of tasks.
    """
    # BUGFIX: previously page <= 0 produced a negative slice start, so e.g.
    # page=-1 silently returned a slice taken from the END of the list.
    page = max(page, 1)
    page_size = max(page_size, 1)

    tasks = list(TASK_REPOSITORY.values())

    # Apply filters (each is skipped when its parameter is None).
    if difficulty:
        tasks = [t for t in tasks if t.difficulty == difficulty]
    if task_type:
        tasks = [t for t in tasks if t.task_type == task_type]
    if tag:
        tasks = [t for t in tasks if tag in t.tags]

    # Paginate.
    total = len(tasks)
    start = (page - 1) * page_size
    end = start + page_size
    paginated_tasks = tasks[start:end]

    return TaskListResponse(
        tasks=paginated_tasks,
        total=total,
        page=page,
        page_size=page_size,
    )
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
@router.get(
    "/{task_id}",
    response_model=Task,
    status_code=status.HTTP_200_OK,
    summary="Get task details",
    description="Get details of a specific task",
)
async def get_task(task_id: str) -> Task:
    """Fetch one task definition by id.

    Args:
        task_id: ID of the task.

    Returns:
        Task: Task details.

    Raises:
        HTTPException: 404 when the id is unknown.
    """
    task = TASK_REPOSITORY.get(task_id)
    if task is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Task {task_id} not found",
        )
    return task
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
@router.post(
    "/",
    response_model=Task,
    status_code=status.HTTP_201_CREATED,
    summary="Create a new task",
    description="Create a new scraping task",
)
async def create_task(task: Task) -> Task:
    """Register a new task in the repository.

    Args:
        task: Task definition (its ``id`` must be unused).

    Returns:
        Task: The created task, echoed back.

    Raises:
        HTTPException: 409 when a task with the same id already exists.
    """
    if TASK_REPOSITORY.get(task.id) is not None:
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail=f"Task {task.id} already exists",
        )

    TASK_REPOSITORY[task.id] = task
    logger.info(f"Created task {task.id}: {task.name}")
    return task
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
@router.get(
    "/types/",
    status_code=status.HTTP_200_OK,
    summary="Get task types",
    description="Get all available task types",
)
async def get_task_types() -> dict[str, list[str]]:
    """Enumerate the task-type and difficulty vocabularies.

    Returns:
        Dict mapping "task_types" and "difficulties" to their string values.
    """
    type_values = [member.value for member in TaskType]
    difficulty_values = [member.value for member in TaskDifficulty]
    return {"task_types": type_values, "difficulties": difficulty_values}
|
backend/app/api/routes/tools.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tool registry and testing endpoints."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
from typing import Any
|
| 5 |
+
|
| 6 |
+
from fastapi import APIRouter, HTTPException, status
|
| 7 |
+
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
router = APIRouter(prefix="/tools")
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ToolParameter(BaseModel):
    """Parameter definition for a tool."""

    # Parameter name as passed in the call payload.
    name: str
    # Declared type name, e.g. "string", "boolean".
    type: str
    description: str
    # Whether the caller must supply the parameter.
    required: bool = True
    # Default value used when the parameter is omitted, if any.
    default: Any | None = None
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class ToolDefinition(BaseModel):
    """Definition of a tool in the registry."""

    # Unique tool name.
    name: str
    description: str
    # Grouping key, e.g. "browser", "extraction".
    category: str
    parameters: list[ToolParameter]
    # Human-readable description of the return value.
    returns: str
    # Example invocations (free-form dicts).
    examples: list[dict[str, Any]] = Field(default_factory=list)
    # True when the tool needs a live browser session.
    requires_browser: bool = False
    # Estimated cost per call — units not defined in visible code; TODO confirm.
    cost_estimate: float = 0.0
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class ToolRegistryResponse(BaseModel):
    """Response containing the tool registry."""

    # All registered tool definitions.
    tools: list[ToolDefinition]
    # Distinct categories present in the registry.
    categories: list[str]
    # Number of tools returned.
    total_count: int
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class ToolTestRequest(BaseModel):
    """Request to test a tool."""

    # Name of a tool registered in TOOL_DEFINITIONS.
    tool_name: str
    # Arguments keyed by ToolParameter.name; required ones must be present.
    parameters: dict[str, Any] = Field(default_factory=dict)
    # When True (default), only validate and echo back a mock result;
    # when False, actually execute the tool.
    dry_run: bool = True
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class ToolTestResponse(BaseModel):
    """Response from tool testing."""

    # Echo of the requested tool name.
    tool_name: str
    # False when validation or execution raised; details are in `error`.
    success: bool
    # Execution (or dry-run mock) result; None on failure.
    result: Any | None = None
    # Stringified exception message when success is False.
    error: str | None = None
    # Wall time spent handling the request, in milliseconds.
    execution_time_ms: float = 0.0
    # Echo of the requested dry_run flag.
    dry_run: bool
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# Tool definitions (would be dynamically registered in production).
# NOTE(review): several parameters declare a `default` but leave
# `required` at its True default (e.g. extract_text.multiple,
# search_engine.engine) — confirm whether those should be required=False,
# since /test rejects requests that omit required parameters.
TOOL_DEFINITIONS: list[ToolDefinition] = [
    # Browser navigation.
    ToolDefinition(
        name="navigate_to",
        description="Navigate the browser to a specified URL",
        category="browser",
        parameters=[
            ToolParameter(name="url", type="string", description="URL to navigate to"),
            ToolParameter(name="wait_for", type="string", description="CSS selector to wait for", required=False),
        ],
        returns="NavigationResult with page info",
        requires_browser=True,
        cost_estimate=0.01,
    ),
    # Element interaction.
    ToolDefinition(
        name="click_element",
        description="Click on an element identified by selector",
        category="browser",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector of element to click"),
        ],
        returns="ClickResult with success status",
        requires_browser=True,
        cost_estimate=0.005,
    ),
    # Content extraction.
    ToolDefinition(
        name="extract_text",
        description="Extract text content from elements",
        category="extraction",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector to extract from"),
            ToolParameter(name="multiple", type="boolean", description="Extract from all matches", default=False),
        ],
        returns="Extracted text or list of texts",
        requires_browser=True,
        cost_estimate=0.002,
    ),
    ToolDefinition(
        name="extract_attribute",
        description="Extract attribute value from element",
        category="extraction",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector"),
            ToolParameter(name="attribute", type="string", description="Attribute name to extract"),
        ],
        returns="Attribute value",
        requires_browser=True,
        cost_estimate=0.002,
    ),
    # Search — the only tool that does not require a browser session.
    ToolDefinition(
        name="search_engine",
        description="Perform a search using a search engine",
        category="search",
        parameters=[
            ToolParameter(name="query", type="string", description="Search query"),
            ToolParameter(name="engine", type="string", description="Search engine", default="google"),
            ToolParameter(name="num_results", type="integer", description="Number of results", default=10),
        ],
        returns="List of search results",
        cost_estimate=0.05,
    ),
    # Form interaction.
    ToolDefinition(
        name="fill_form",
        description="Fill a form field with a value",
        category="browser",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector of form field"),
            ToolParameter(name="value", type="string", description="Value to fill"),
        ],
        returns="FillResult with success status",
        requires_browser=True,
        cost_estimate=0.005,
    ),
    # Page capture.
    ToolDefinition(
        name="screenshot",
        description="Take a screenshot of the current page",
        category="browser",
        parameters=[
            ToolParameter(name="full_page", type="boolean", description="Capture full page", default=False),
        ],
        returns="Base64 encoded screenshot",
        requires_browser=True,
        cost_estimate=0.01,
    ),
    ToolDefinition(
        name="get_page_html",
        description="Get the full HTML content of the current page",
        category="extraction",
        parameters=[],
        returns="HTML string",
        requires_browser=True,
        cost_estimate=0.001,
    ),
    # Synchronization / scrolling.
    ToolDefinition(
        name="wait_for_selector",
        description="Wait for an element to appear on the page",
        category="browser",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector to wait for"),
            ToolParameter(name="timeout_ms", type="integer", description="Timeout in milliseconds", default=30000),
        ],
        returns="Boolean indicating if element appeared",
        requires_browser=True,
        cost_estimate=0.001,
    ),
    ToolDefinition(
        name="scroll_to",
        description="Scroll to a position or element",
        category="browser",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector", required=False),
            ToolParameter(name="position", type="string", description="Position: top, bottom, or pixel value", required=False),
        ],
        returns="ScrollResult",
        requires_browser=True,
        cost_estimate=0.001,
    ),
]
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
@router.get(
    "/registry",
    response_model=ToolRegistryResponse,
    status_code=status.HTTP_200_OK,
    summary="Get tool registry",
    description="Get all available tools in the registry",
)
async def get_tool_registry(category: str | None = None) -> ToolRegistryResponse:
    """
    Get the tool registry with all available tools.

    Args:
        category: Optional filter by category. An unknown category yields
            an empty tool list (still HTTP 200), not a 404.

    Returns:
        ToolRegistryResponse: Filtered tools, all known categories, and
        the filtered tool count.
    """
    tools = TOOL_DEFINITIONS
    if category:
        tools = [t for t in tools if t.category == category]

    # sorted() makes the category list deterministic; bare list(set(...))
    # varies per process because of string hash randomization, which makes
    # API responses flaky for clients and tests.
    categories = sorted({t.category for t in TOOL_DEFINITIONS})

    return ToolRegistryResponse(
        tools=tools,
        categories=categories,
        total_count=len(tools),
    )
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
@router.get(
    "/registry/{tool_name}",
    response_model=ToolDefinition,
    status_code=status.HTTP_200_OK,
    summary="Get tool details",
    description="Get details of a specific tool",
)
async def get_tool_details(tool_name: str) -> ToolDefinition:
    """
    Look up a single tool in the registry by name.

    Args:
        tool_name: Name of the tool.

    Returns:
        ToolDefinition: The matching tool definition.

    Raises:
        HTTPException: 404 when no tool with that name is registered.
    """
    match = next((t for t in TOOL_DEFINITIONS if t.name == tool_name), None)
    if match is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Tool '{tool_name}' not found",
        )
    return match
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
@router.post(
    "/test",
    response_model=ToolTestResponse,
    status_code=status.HTTP_200_OK,
    summary="Test a tool",
    description="Test a tool with provided parameters",
)
async def test_tool(request: ToolTestRequest) -> ToolTestResponse:
    """
    Test a tool execution, optionally as a dry run.

    Args:
        request: Names the tool, supplies its parameters, and selects
            dry-run (validate + mock result) vs. real execution.

    Returns:
        ToolTestResponse: Validation and execution failures are reported
        in-band (success=False with `error` set), not as HTTP errors.

    Raises:
        HTTPException: 404 if the named tool is not registered.
    """
    import time

    # perf_counter is monotonic, so the measured duration cannot go
    # negative or jump when the system wall clock is adjusted mid-request
    # (time.time() can).
    start_time = time.perf_counter()
    # Lazy %-formatting defers string building until the record is emitted.
    logger.info("Testing tool '%s' with dry_run=%s", request.tool_name, request.dry_run)

    # Find the tool.
    tool = next((t for t in TOOL_DEFINITIONS if t.name == request.tool_name), None)
    if tool is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Tool '{request.tool_name}' not found",
        )

    try:
        # Validate required parameters before running anything.
        for param in tool.parameters:
            if param.required and param.name not in request.parameters:
                raise ValueError(f"Missing required parameter: {param.name}")

        if request.dry_run:
            # Return mock result for dry run — no browser / network touched.
            result = {
                "status": "dry_run",
                "tool": request.tool_name,
                "parameters": request.parameters,
                "would_require_browser": tool.requires_browser,
            }
        else:
            # Actually execute the tool (placeholder); imported lazily so
            # the route module loads without the registry package.
            from app.tools.registry import MCPToolRegistry

            registry = MCPToolRegistry()
            result = await registry.execute_tool(request.tool_name, request.parameters)

        execution_time = (time.perf_counter() - start_time) * 1000

        return ToolTestResponse(
            tool_name=request.tool_name,
            success=True,
            result=result,
            execution_time_ms=execution_time,
            dry_run=request.dry_run,
        )
    except Exception as e:
        # Intentionally broad: any validation or execution failure is
        # surfaced as a structured, unsuccessful test result.
        execution_time = (time.perf_counter() - start_time) * 1000
        logger.error("Tool test failed: %s", e)
        return ToolTestResponse(
            tool_name=request.tool_name,
            success=False,
            error=str(e),
            execution_time_ms=execution_time,
            dry_run=request.dry_run,
        )
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
@router.get(
    "/categories",
    status_code=status.HTTP_200_OK,
    summary="Get tool categories",
    description="Get all tool categories",
)
async def get_categories() -> dict[str, dict[str, list[str]]]:
    """
    Get all tool categories with the tool names in each.

    Returns:
        Dict of the form {"categories": {category: [tool_name, ...]}}.
    """
    # Fixed return annotation: the previous dict[str, list[str]] did not
    # match the actual {"categories": {...}} shape, and FastAPI uses the
    # annotation as the response model, so serialization would fail.
    categories: dict[str, list[str]] = {}
    for tool in TOOL_DEFINITIONS:
        # setdefault replaces the manual "if key not in dict" dance.
        categories.setdefault(tool.category, []).append(tool.name)
    return {"categories": categories}
|
backend/app/utils/__init__.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Utility modules for ScrapeRL backend."""
|
| 2 |
+
|
| 3 |
+
from app.utils.html import (
|
| 4 |
+
parse_html,
|
| 5 |
+
clean_html,
|
| 6 |
+
extract_text,
|
| 7 |
+
semantic_chunk,
|
| 8 |
+
extract_links,
|
| 9 |
+
extract_tables,
|
| 10 |
+
)
|
| 11 |
+
from app.utils.logging import setup_logging, get_logger
|
| 12 |
+
|
| 13 |
+
__all__ = [
|
| 14 |
+
"parse_html",
|
| 15 |
+
"clean_html",
|
| 16 |
+
"extract_text",
|
| 17 |
+
"semantic_chunk",
|
| 18 |
+
"extract_links",
|
| 19 |
+
"extract_tables",
|
| 20 |
+
"setup_logging",
|
| 21 |
+
"get_logger",
|
| 22 |
+
]
|
backend/app/utils/__pycache__/__init__.cpython-314.pyc
ADDED
|
Binary file (492 Bytes). View file
|
|
|
backend/app/utils/__pycache__/html.cpython-314.pyc
ADDED
|
Binary file (10.7 kB). View file
|
|
|
backend/app/utils/__pycache__/logging.cpython-314.pyc
ADDED
|
Binary file (2.54 kB). View file
|
|
|
backend/app/utils/html.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""HTML processing utilities for ScrapeRL backend."""
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from typing import Any, Optional
|
| 5 |
+
from bs4 import BeautifulSoup, Tag, NavigableString
|
| 6 |
+
|
| 7 |
+
from app.utils.logging import get_logger
|
| 8 |
+
|
| 9 |
+
logger = get_logger(__name__)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def parse_html(html: str, parser: str = "html.parser") -> BeautifulSoup:
    """
    Turn a raw HTML string into a navigable BeautifulSoup tree.

    Args:
        html: Raw HTML string
        parser: Parser backend to use (html.parser, lxml, html5lib)

    Returns:
        Parsed BeautifulSoup object
    """
    document = BeautifulSoup(html, parser)
    return document
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def clean_html(
    html: str,
    remove_scripts: bool = True,
    remove_styles: bool = True,
    remove_comments: bool = True,
    remove_tags: Optional[list[str]] = None,
) -> str:
    """
    Strip unwanted elements from an HTML document and return the result.

    Args:
        html: Raw HTML string
        remove_scripts: Drop <script> elements
        remove_styles: Drop <style> elements
        remove_comments: Drop HTML comments
        remove_tags: Extra tag names to drop

    Returns:
        Cleaned HTML string
    """
    soup = parse_html(html)

    # Collect every tag name slated for removal, then remove in one pass.
    doomed: list[str] = []
    if remove_scripts:
        doomed.append("script")
    if remove_styles:
        doomed.append("style")
    if remove_tags:
        doomed.extend(remove_tags)

    for tag_name in doomed:
        for node in soup.find_all(tag_name):
            node.decompose()

    if remove_comments:
        from bs4 import Comment

        for note in soup.find_all(string=lambda text: isinstance(text, Comment)):
            note.extract()

    return str(soup)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def extract_text(
    html: str,
    separator: str = " ",
    strip: bool = True,
) -> str:
    """
    Extract the plain-text content of an HTML document.

    Args:
        html: Raw HTML string
        separator: String placed between text segments
        strip: Collapse runs of whitespace and trim the result

    Returns:
        Extracted plain text
    """
    document = parse_html(html)

    # Script/style/noscript bodies are not user-visible content.
    for noise in document(["script", "style", "noscript"]):
        noise.decompose()

    raw = document.get_text(separator=separator)
    return re.sub(r"\s+", " ", raw).strip() if strip else raw
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def semantic_chunk(
    html: str,
    max_chunk_size: int = 4000,
    overlap: int = 200,
) -> list[dict[str, Any]]:
    """
    Split HTML content into semantic chunks based on structure.

    Walks block-level elements in document order, accumulating their text
    into chunks of at most ``max_chunk_size`` characters, carrying
    ``overlap`` trailing characters into each new chunk, and recording the
    tag names and headings seen per chunk. Falls back to fixed-size
    splitting of the plain text when no structural elements are found.

    NOTE(review): find_all matches nested containers independently, so a
    <div> and the <p> inside it both contribute their text — content can
    appear twice in the output. Confirm whether this duplication is
    intended before relying on chunk sizes.

    Args:
        html: Raw HTML string
        max_chunk_size: Maximum characters per chunk
        overlap: Number of characters to overlap between chunks

    Returns:
        List of chunk dictionaries with text and metadata
        ({"text", "metadata": {"tags", "headings"}, "char_count"})
    """
    soup = parse_html(html)
    chunks: list[dict[str, Any]] = []

    # Remove non-content elements (chrome as well as script/style).
    for element in soup(["script", "style", "noscript", "nav", "footer", "header"]):
        element.decompose()

    # Find semantic boundaries — block containers and headings.
    semantic_tags = ["article", "section", "div", "p", "h1", "h2", "h3", "h4", "h5", "h6"]

    def get_text_content(element: Tag | NavigableString) -> str:
        # Bare text nodes have no get_text(); stringify them directly.
        if isinstance(element, NavigableString):
            return str(element).strip()
        return element.get_text(separator=" ", strip=True)

    current_chunk = ""
    current_metadata: dict[str, Any] = {"tags": [], "headings": []}

    for element in soup.find_all(semantic_tags):
        text = get_text_content(element)
        if not text:
            continue

        tag_name = element.name if isinstance(element, Tag) else "text"

        # Check if adding this would exceed max size (+1 for the joining space).
        if len(current_chunk) + len(text) + 1 > max_chunk_size:
            if current_chunk:
                # .copy() is shallow but sufficient: the metadata dict is
                # rebound to a fresh dict (with fresh lists) right below.
                chunks.append({
                    "text": current_chunk.strip(),
                    "metadata": current_metadata.copy(),
                    "char_count": len(current_chunk),
                })
            # Start new chunk, seeding it with the tail of the previous one.
            if overlap > 0 and current_chunk:
                current_chunk = current_chunk[-overlap:] + " " + text
            else:
                current_chunk = text
            current_metadata = {"tags": [tag_name], "headings": []}
        else:
            current_chunk += " " + text if current_chunk else text
            current_metadata["tags"].append(tag_name)

        # Track headings (truncated) so chunks stay attributable to sections.
        if tag_name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            current_metadata["headings"].append(text[:100])

    # Add remaining content as the final chunk.
    if current_chunk.strip():
        chunks.append({
            "text": current_chunk.strip(),
            "metadata": current_metadata,
            "char_count": len(current_chunk),
        })

    # If no semantic chunks found, fall back to simple fixed-window chunking
    # of the plain text, stepping by (max_chunk_size - overlap).
    if not chunks:
        text = extract_text(html)
        for i in range(0, len(text), max_chunk_size - overlap):
            chunk_text = text[i : i + max_chunk_size]
            if chunk_text.strip():
                chunks.append({
                    "text": chunk_text.strip(),
                    "metadata": {"tags": [], "headings": []},
                    "char_count": len(chunk_text),
                })

    return chunks
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def extract_links(
    html: str,
    base_url: Optional[str] = None,
    include_text: bool = True,
) -> list[dict[str, str]]:
    """
    Collect every usable hyperlink from an HTML document.

    Args:
        html: Raw HTML string
        base_url: Base URL for resolving relative links
        include_text: Include the anchor text in each result

    Returns:
        List of link dictionaries with href and optionally text/title
    """
    from urllib.parse import urljoin

    soup = parse_html(html)
    results: list[dict[str, str]] = []

    for anchor in soup.find_all("a", href=True):
        target = anchor.get("href", "")
        # Skip empty, fragment-only, and javascript: pseudo-links.
        if not target or target.startswith(("#", "javascript:")):
            continue

        # Resolve relative URLs against the supplied base.
        if base_url and not target.startswith(("http://", "https://", "//")):
            target = urljoin(base_url, target)

        entry: dict[str, str] = {"href": target}

        if include_text:
            entry["text"] = anchor.get_text(strip=True)

        # Carry the tooltip along when the anchor has one.
        tooltip = anchor.get("title")
        if tooltip:
            entry["title"] = tooltip

        results.append(entry)

    return results
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def extract_tables(
    html: str,
    include_headers: bool = True,
) -> list[dict[str, Any]]:
    """
    Extract every <table> in an HTML document as structured data.

    Args:
        html: Raw HTML string
        include_headers: Try to identify header rows (from <thead>, or the
            first <th>-bearing row encountered)

    Returns:
        List of table dictionaries, each {"headers": [...], "rows": [[...]]}
    """
    soup = parse_html(html)
    extracted: list[dict[str, Any]] = []

    for table in soup.find_all("table"):
        headers: list[str] = []
        rows: list[list[str]] = []

        # Prefer an explicit <thead> row for the headers.
        if include_headers:
            thead = table.find("thead")
            header_row = thead.find("tr") if thead else None
            if header_row:
                headers = [
                    cell.get_text(strip=True)
                    for cell in header_row.find_all(["th", "td"])
                ]

        # Walk the body rows (the table itself when no <tbody> exists).
        body = table.find("tbody") or table
        for tr in body.find_all("tr"):
            values = [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]

            # A <th>-bearing row doubles as the header when none was found yet.
            if include_headers and not headers and tr.find("th"):
                headers = values
            elif values:  # drop empty rows
                rows.append(values)

        if rows or headers:
            extracted.append({"headers": headers, "rows": rows})

    return extracted
|
backend/app/utils/logging.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Logging utilities for ScrapeRL backend."""
|
| 2 |
+
|
| 3 |
+
import logging
|
| 4 |
+
import sys
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def setup_logging(
    level: str = "INFO",
    format_string: Optional[str] = None,
    log_file: Optional[str] = None,
) -> None:
    """
    Configure logging for the application.

    Args:
        level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL);
            unknown names fall back to INFO.
        format_string: Custom format string for log messages
        log_file: Optional file path to write logs to
    """
    if format_string is None:
        format_string = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

    log_level = getattr(logging, level.upper(), logging.INFO)

    handlers: list[logging.Handler] = [logging.StreamHandler(sys.stdout)]

    if log_file:
        # Explicit UTF-8: without it, FileHandler uses the locale encoding
        # (e.g. cp1252 on Windows), which can raise UnicodeEncodeError on
        # non-ASCII log messages.
        handlers.append(logging.FileHandler(log_file, encoding="utf-8"))

    logging.basicConfig(
        level=log_level,
        format=format_string,
        handlers=handlers,
    )

    # Reduce noise from third-party HTTP libraries.
    for noisy in ("httpx", "httpcore", "urllib3"):
        logging.getLogger(noisy).setLevel(logging.WARNING)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def get_logger(name: str) -> logging.Logger:
    """
    Fetch the stdlib logger registered under the given name.

    Args:
        name: Logger name, typically __name__ of the calling module

    Returns:
        Configured logger instance
    """
    logger = logging.getLogger(name)
    return logger
|