NeerajCodz committed on
Commit
27cde0c
·
1 Parent(s): afefaea

feat: add API routes and utility modules

Browse files
backend/app/api/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """API package initialization."""
2
+
3
+ from app.api.routes import agents, episode, health, memory, tasks, tools
4
+
5
+ __all__ = ["agents", "episode", "health", "memory", "tasks", "tools"]
backend/app/api/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (349 Bytes). View file
 
backend/app/api/__pycache__/deps.cpython-314.pyc ADDED
Binary file (7.7 kB). View file
 
backend/app/api/deps.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Dependency injection utilities for API routes."""
2
+
3
+ from typing import Annotated, Generator
4
+
5
+ from fastapi import Depends, HTTPException, status
6
+
7
+ from app.config import Settings, get_settings
8
+ from app.core.env import WebScraperEnv
9
+ from app.memory.manager import MemoryManager
10
+ from app.models.router import SmartModelRouter
11
+ from app.tools.registry import MCPToolRegistry
12
+
13
+
14
+ # Settings dependency
15
+ SettingsDep = Annotated[Settings, Depends(get_settings)]
16
+
17
+
18
+ # Store for active environments
19
+ _active_environments: dict[str, WebScraperEnv] = {}
20
+
21
+
22
def get_environment(episode_id: str) -> WebScraperEnv:
    """Look up the active environment registered under *episode_id*.

    Raises:
        HTTPException: 404 when no environment exists for the given ID.
    """
    env = _active_environments.get(episode_id)
    if env is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Episode {episode_id} not found",
        )
    return env
30
+
31
+
32
def create_environment(episode_id: str, settings: Settings) -> WebScraperEnv:
    """Register and return a fresh environment for *episode_id*.

    Raises:
        HTTPException: 409 when the episode already has an environment.
    """
    if episode_id in _active_environments:
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail=f"Episode {episode_id} already exists",
        )
    environment = WebScraperEnv(episode_id=episode_id, settings=settings)
    _active_environments[episode_id] = environment
    return environment
42
+
43
+
44
def remove_environment(episode_id: str) -> bool:
    """Drop the environment for *episode_id*; return True if one existed."""
    return _active_environments.pop(episode_id, None) is not None
50
+
51
+
52
def list_environments() -> list[str]:
    """Return the episode IDs of every currently active environment."""
    return [*_active_environments]
55
+
56
+
57
class DependencyContainer:
    """Holds the application-wide singletons used for dependency injection.

    Each component is registered once (at startup) through its ``set_*``
    method; the matching ``get_*`` fails fast with ``RuntimeError`` when
    accessed before registration.
    """

    def __init__(self) -> None:
        # All slots start unset; they are wired up during app startup.
        self._memory_manager: MemoryManager | None = None
        self._model_router: SmartModelRouter | None = None
        self._tool_registry: MCPToolRegistry | None = None

    # --- memory manager -------------------------------------------------

    def set_memory_manager(self, manager: MemoryManager) -> None:
        """Register the memory manager instance."""
        self._memory_manager = manager

    def get_memory_manager(self) -> MemoryManager:
        """Return the memory manager; raise if not yet registered."""
        if self._memory_manager is None:
            raise RuntimeError("Memory manager not initialized")
        return self._memory_manager

    # --- model router ---------------------------------------------------

    def set_model_router(self, router: SmartModelRouter) -> None:
        """Register the model router instance."""
        self._model_router = router

    def get_model_router(self) -> SmartModelRouter:
        """Return the model router; raise if not yet registered."""
        if self._model_router is None:
            raise RuntimeError("Model router not initialized")
        return self._model_router

    # --- tool registry --------------------------------------------------

    def set_tool_registry(self, registry: MCPToolRegistry) -> None:
        """Register the tool registry instance."""
        self._tool_registry = registry

    def get_tool_registry(self) -> MCPToolRegistry:
        """Return the tool registry; raise if not yet registered."""
        if self._tool_registry is None:
            raise RuntimeError("Tool registry not initialized")
        return self._tool_registry
94
+
95
+
96
# Global container instance shared by the FastAPI dependency functions below.
container = DependencyContainer()


def get_memory_manager() -> MemoryManager:
    """Dependency for memory manager. Raises RuntimeError if not yet wired."""
    return container.get_memory_manager()


def get_model_router() -> SmartModelRouter:
    """Dependency for model router. Raises RuntimeError if not yet wired."""
    return container.get_model_router()


def get_tool_registry() -> MCPToolRegistry:
    """Dependency for tool registry. Raises RuntimeError if not yet wired."""
    return container.get_tool_registry()


# Type aliases for dependency injection (use as parameter annotations in routes)
MemoryManagerDep = Annotated[MemoryManager, Depends(get_memory_manager)]
ModelRouterDep = Annotated[SmartModelRouter, Depends(get_model_router)]
ToolRegistryDep = Annotated[MCPToolRegistry, Depends(get_tool_registry)]
backend/app/api/routes/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """API routes package."""
2
+
3
+ from app.api.routes import agents, episode, health, memory, tasks, tools
4
+
5
+ __all__ = ["agents", "episode", "health", "memory", "tasks", "tools"]
backend/app/api/routes/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (348 Bytes). View file
 
backend/app/api/routes/__pycache__/agents.cpython-314.pyc ADDED
Binary file (14.7 kB). View file
 
backend/app/api/routes/__pycache__/episode.cpython-314.pyc ADDED
Binary file (11.2 kB). View file
 
backend/app/api/routes/__pycache__/health.cpython-314.pyc ADDED
Binary file (6.35 kB). View file
 
backend/app/api/routes/__pycache__/memory.cpython-314.pyc ADDED
Binary file (16.5 kB). View file
 
backend/app/api/routes/__pycache__/tasks.cpython-314.pyc ADDED
Binary file (11.3 kB). View file
 
backend/app/api/routes/__pycache__/tools.cpython-314.pyc ADDED
Binary file (13.1 kB). View file
 
backend/app/api/routes/agents.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Agent management endpoints."""
2
+
3
+ import logging
4
+ from enum import Enum
5
+ from typing import Any
6
+ from uuid import uuid4
7
+
8
+ from fastapi import APIRouter, HTTPException, status
9
+ from pydantic import BaseModel, Field
10
+
11
+ router = APIRouter(prefix="/agents")
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class AgentType(str, Enum):
    """Types of agents in the system (values are the wire-format strings)."""

    PLANNER = "planner"
    NAVIGATOR = "navigator"
    EXTRACTOR = "extractor"
    VERIFIER = "verifier"
    MEMORY = "memory"
    COORDINATOR = "coordinator"


class AgentStatus(str, Enum):
    """Agent execution status reported in run responses and agent state."""

    IDLE = "idle"
    PLANNING = "planning"
    EXECUTING = "executing"
    WAITING = "waiting"
    COMPLETED = "completed"
    FAILED = "failed"


class AgentRunRequest(BaseModel):
    """Request to run an agent against an episode's current observation."""

    agent_type: AgentType
    episode_id: str
    task_context: dict[str, Any] = Field(default_factory=dict)
    observation: dict[str, Any] | None = None
    config: dict[str, Any] = Field(default_factory=dict)


class AgentRunResponse(BaseModel):
    """Response from agent execution; on failure, `reasoning` holds the error text."""

    run_id: str
    agent_type: AgentType
    status: AgentStatus
    action: dict[str, Any] | None = None
    reasoning: str | None = None
    confidence: float | None = None
    tokens_used: int = 0
    execution_time_ms: float = 0.0


class PlanRequest(BaseModel):
    """Request for creating a plan from a task description."""

    episode_id: str
    task_description: str
    current_state: dict[str, Any] = Field(default_factory=dict)
    constraints: list[str] = Field(default_factory=list)


class PlanStep(BaseModel):
    """A single step in a plan (step_number is assigned 1-based by the route)."""

    step_number: int
    action_type: str
    description: str
    agent: AgentType
    dependencies: list[int] = Field(default_factory=list)
    estimated_cost: float = 0.0


class PlanResponse(BaseModel):
    """Response containing a generated plan and the planner's confidence."""

    plan_id: str
    episode_id: str
    steps: list[PlanStep]
    total_estimated_steps: int
    reasoning: str
    confidence: float


class AgentState(BaseModel):
    """Current state of an agent as tracked by this service."""

    agent_id: str
    agent_type: AgentType
    status: AgentStatus
    current_task: str | None = None
    messages_processed: int = 0
    actions_taken: int = 0
    last_action: dict[str, Any] | None = None
    memory_snapshot: dict[str, Any] = Field(default_factory=dict)


# Store for agent states.
# NOTE(review): process-local and in-memory only — cleared on restart and not
# shared across workers; confirm this is acceptable for deployment.
_agent_states: dict[str, AgentState] = {}
106
+
107
+
108
@router.post(
    "/run",
    response_model=AgentRunResponse,
    status_code=status.HTTP_200_OK,
    summary="Run an agent",
    description="Execute an agent to produce an action",
)
async def run_agent(request: AgentRunRequest) -> AgentRunResponse:
    """
    Run an agent to produce an action for the current observation.

    Args:
        request: Agent run configuration.

    Returns:
        AgentRunResponse: COMPLETED with the agent's action on success;
        FAILED with the error text in ``reasoning`` on any exception
        (deliberately returned as a 200 body rather than an HTTP error).
    """
    run_id = str(uuid4())
    # Lazy %-style args: the message is only formatted if the record is emitted.
    logger.info(
        "Running %s agent for episode %s", request.agent_type, request.episode_id
    )

    try:
        # Imported lazily — presumably to avoid import cycles / startup cost;
        # TODO(review): confirm against app.agents.coordinator.
        from app.agents.coordinator import AgentCoordinator

        coordinator = AgentCoordinator()
        result = await coordinator.run_agent(
            agent_type=request.agent_type,
            episode_id=request.episode_id,
            observation=request.observation,
            config=request.config,
        )

        return AgentRunResponse(
            run_id=run_id,
            agent_type=request.agent_type,
            status=AgentStatus.COMPLETED,
            action=result.get("action"),
            reasoning=result.get("reasoning"),
            confidence=result.get("confidence"),
            tokens_used=result.get("tokens_used", 0),
            execution_time_ms=result.get("execution_time_ms", 0.0),
        )
    except Exception as e:
        # logger.exception records the traceback (logger.error dropped it).
        logger.exception("Agent execution failed: %s", e)
        return AgentRunResponse(
            run_id=run_id,
            agent_type=request.agent_type,
            status=AgentStatus.FAILED,
            reasoning=str(e),
        )
158
+
159
+
160
@router.post(
    "/plan",
    response_model=PlanResponse,
    status_code=status.HTTP_200_OK,
    summary="Generate a plan",
    description="Use the planner agent to generate an execution plan",
)
async def generate_plan(request: PlanRequest) -> PlanResponse:
    """
    Generate a plan for completing a task.

    Args:
        request: Planning request with task details.

    Returns:
        PlanResponse: Generated plan with 1-based numbered steps.

    Raises:
        HTTPException: 500 when the planner fails for any reason.
    """
    plan_id = str(uuid4())
    logger.info("Generating plan for episode %s", request.episode_id)

    try:
        # Imported lazily — presumably to avoid import cycles; TODO(review): confirm.
        from app.agents.planner import PlannerAgent

        planner = PlannerAgent()
        plan_result = await planner.create_plan(
            task_description=request.task_description,
            current_state=request.current_state,
            constraints=request.constraints,
        )

        # Renumber the planner's steps sequentially starting at 1.
        steps = [
            PlanStep(
                step_number=i + 1,
                action_type=step["action_type"],
                description=step["description"],
                agent=AgentType(step["agent"]),
                dependencies=step.get("dependencies", []),
                estimated_cost=step.get("estimated_cost", 0.0),
            )
            for i, step in enumerate(plan_result["steps"])
        ]

        return PlanResponse(
            plan_id=plan_id,
            episode_id=request.episode_id,
            steps=steps,
            total_estimated_steps=len(steps),
            reasoning=plan_result.get("reasoning", ""),
            confidence=plan_result.get("confidence", 0.8),
        )
    except Exception as e:
        # logger.exception keeps the traceback; `from e` preserves the cause chain.
        logger.exception("Plan generation failed: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to generate plan: {str(e)}",
        ) from e
216
+
217
+
218
@router.get(
    "/state/{agent_id}",
    response_model=AgentState,
    status_code=status.HTTP_200_OK,
    summary="Get agent state",
    description="Get the current state of an agent",
)
async def get_agent_state(agent_id: str) -> AgentState:
    """
    Return the tracked state for *agent_id*.

    Args:
        agent_id: ID of the agent.

    Returns:
        AgentState: Current agent state.

    Raises:
        HTTPException: 404 when the agent is unknown.
    """
    state = _agent_states.get(agent_id)
    if state is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Agent {agent_id} not found",
        )
    return state
241
+
242
+
243
@router.get(
    "/types/",
    status_code=status.HTTP_200_OK,
    summary="Get agent types",
    description="Get all available agent types",
)
async def get_agent_types() -> dict[str, list[dict[str, str]]]:
    """
    Describe every available agent type.

    Returns:
        Dict with an "agents" list of {"type", "description"} entries.
    """
    descriptions = {
        AgentType.PLANNER: "Creates execution plans for tasks",
        AgentType.NAVIGATOR: "Handles page navigation and URL management",
        AgentType.EXTRACTOR: "Extracts data from web pages",
        AgentType.VERIFIER: "Validates extracted data",
        AgentType.MEMORY: "Manages memory operations",
        AgentType.COORDINATOR: "Orchestrates multi-agent collaboration",
    }
    return {
        "agents": [
            {"type": agent.value, "description": text}
            for agent, text in descriptions.items()
        ]
    }
265
+
266
+
267
@router.post(
    "/message",
    status_code=status.HTTP_200_OK,
    summary="Send inter-agent message",
    description="Send a message between agents",
)
async def send_agent_message(
    from_agent: str,
    to_agent: str,
    message_type: str,
    content: dict[str, Any],
) -> dict[str, Any]:
    """
    Relay a message from one agent to another.

    Args:
        from_agent: Source agent ID.
        to_agent: Target agent ID.
        message_type: Type of message.
        content: Message content.

    Returns:
        Acknowledgment of message delivery.
    """
    message_id = str(uuid4())
    logger.info(f"Message {message_id}: {from_agent} -> {to_agent} ({message_type})")

    # In production, this would go through a message broker
    ack: dict[str, Any] = {
        "message_id": message_id,
        "status": "delivered",
        "from": from_agent,
        "to": to_agent,
        "type": message_type,
    }
    return ack
backend/app/api/routes/episode.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Episode management endpoints - reset, step, and state operations."""
2
+
3
+ import logging
4
+ from typing import Any
5
+ from uuid import uuid4
6
+
7
+ from fastapi import APIRouter, HTTPException, status
8
+ from pydantic import BaseModel, Field
9
+
10
+ from app.api.deps import SettingsDep, create_environment, get_environment, remove_environment, list_environments
11
+ from app.core.action import Action, ActionType
12
+ from app.core.observation import Observation
13
+
14
+ router = APIRouter(prefix="/episode")
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class ResetRequest(BaseModel):
    """Request model for resetting (creating) an episode."""

    task_id: str = Field(..., description="ID of the task to execute")
    seed: int | None = Field(default=None, description="Random seed for reproducibility")
    config: dict[str, Any] | None = Field(default=None, description="Episode configuration overrides")


class ResetResponse(BaseModel):
    """Response model for episode reset: new ID plus the initial observation."""

    episode_id: str
    task_id: str
    observation: Observation
    info: dict[str, Any]


class StepRequest(BaseModel):
    """Request model for taking a step in an existing episode."""

    episode_id: str = Field(..., description="ID of the episode")
    action: Action = Field(..., description="Action to execute")


class StepResponse(BaseModel):
    """Response model for step execution (gym-style observation/reward/flags)."""

    observation: Observation
    reward: float
    reward_breakdown: dict[str, float]
    terminated: bool
    truncated: bool
    info: dict[str, Any]


class EpisodeState(BaseModel):
    """Current state of an episode, mirrored from the environment snapshot."""

    episode_id: str
    task_id: str
    step_number: int
    current_url: str | None
    is_terminal: bool
    total_reward: float
    extracted_data: dict[str, Any]


class EpisodeListResponse(BaseModel):
    """Response model for listing episodes."""

    episodes: list[str]
    count: int
70
+
71
+
72
@router.post(
    "/reset",
    response_model=ResetResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Reset/create new episode",
    description="Create a new episode for a given task",
)
async def reset_episode(
    request: ResetRequest,
    settings: SettingsDep,
) -> ResetResponse:
    """
    Reset and initialize a new episode.

    Args:
        request: Reset request containing task_id and optional seed.
        settings: Application settings.

    Returns:
        ResetResponse: New episode ID and initial observation.

    Raises:
        HTTPException: 409 if the (freshly generated) ID collides,
        500 if environment creation or reset fails.
    """
    episode_id = str(uuid4())
    logger.info("Creating new episode %s for task %s", episode_id, request.task_id)

    try:
        env = create_environment(episode_id, settings)
        observation, info = await env.reset(
            task_id=request.task_id,
            seed=request.seed,
            config=request.config,
        )

        return ResetResponse(
            episode_id=episode_id,
            task_id=request.task_id,
            observation=observation,
            info=info,
        )
    except Exception as e:
        # logger.exception keeps the traceback; remove the half-created env
        # so a failed reset does not leak a registered environment.
        logger.exception("Failed to reset episode: %s", e)
        remove_environment(episode_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to create episode: {str(e)}",
        ) from e
117
+
118
+
119
@router.post(
    "/step",
    response_model=StepResponse,
    status_code=status.HTTP_200_OK,
    summary="Execute action step",
    description="Execute an action in the episode and receive observation and reward",
)
async def step_episode(request: StepRequest) -> StepResponse:
    """
    Execute an action step in the episode.

    Args:
        request: Step request containing episode_id and action.

    Returns:
        StepResponse: New observation, reward, and termination status.

    Raises:
        HTTPException: 404 if the episode is unknown, 500 if the step fails.
    """
    logger.info("Step in episode %s: %s", request.episode_id, request.action.action_type)

    env = get_environment(request.episode_id)

    try:
        observation, reward, reward_breakdown, terminated, truncated, info = await env.step(
            request.action
        )

        if terminated or truncated:
            # NOTE(review): only logs — the environment is NOT removed here;
            # callers must DELETE /episode/{id} to free it. Confirm intended.
            logger.info("Episode %s completed", request.episode_id)

        return StepResponse(
            observation=observation,
            reward=reward,
            reward_breakdown=reward_breakdown,
            terminated=terminated,
            truncated=truncated,
            info=info,
        )
    except Exception as e:
        # logger.exception keeps the traceback; `from e` preserves the cause chain.
        logger.exception("Step failed: %s", e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Step execution failed: {str(e)}",
        ) from e
163
+
164
+
165
@router.get(
    "/state/{episode_id}",
    response_model=EpisodeState,
    status_code=status.HTTP_200_OK,
    summary="Get episode state",
    description="Get the current state of an episode",
)
async def get_episode_state(episode_id: str) -> EpisodeState:
    """
    Return a snapshot of the episode's current state.

    Args:
        episode_id: ID of the episode.

    Returns:
        EpisodeState: Current episode state.

    Raises:
        HTTPException: 404 when the episode is unknown.
    """
    snapshot = get_environment(episode_id).get_state()

    state_fields = (
        "task_id",
        "step_number",
        "current_url",
        "is_terminal",
        "total_reward",
        "extracted_data",
    )
    return EpisodeState(
        episode_id=episode_id,
        **{name: snapshot[name] for name in state_fields},
    )
194
+
195
+
196
@router.delete(
    "/{episode_id}",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Delete episode",
    description="Clean up and delete an episode",
)
async def delete_episode(episode_id: str) -> None:
    """
    Delete an episode and clean up resources.

    Args:
        episode_id: ID of the episode to delete.

    Raises:
        HTTPException: 404 when no such episode is registered.
    """
    removed = remove_environment(episode_id)
    if not removed:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Episode {episode_id} not found",
        )
    logger.info(f"Deleted episode {episode_id}")
215
+
216
+
217
@router.get(
    "/",
    response_model=EpisodeListResponse,
    status_code=status.HTTP_200_OK,
    summary="List episodes",
    description="List all active episodes",
)
async def list_episodes() -> EpisodeListResponse:
    """
    Enumerate the active episodes.

    Returns:
        EpisodeListResponse: IDs of all active episodes and their count.
    """
    active = list_environments()
    return EpisodeListResponse(episodes=active, count=len(active))
backend/app/api/routes/health.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Health check endpoints."""
2
+
3
+ import logging
4
+ from datetime import datetime, timezone
5
+ from typing import Any
6
+
7
+ from fastapi import APIRouter, status
8
+ from pydantic import BaseModel
9
+
10
+ from app.config import get_settings
11
+
12
+ router = APIRouter()
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class HealthResponse(BaseModel):
    """Health check response model."""

    status: str  # "healthy" (set by health_check)
    timestamp: str  # ISO-8601 UTC timestamp
    version: str  # app version from settings
    uptime_seconds: float | None = None  # None until set_startup_time() is called


class ReadyResponse(BaseModel):
    """Readiness check response model."""

    ready: bool  # True only when every individual check passed
    checks: dict[str, bool]  # per-component pass/fail
    details: dict[str, Any] | None = None  # error texts / extra info, if any


# Track startup time; populated by set_startup_time() at app startup.
_startup_time: datetime | None = None
35
+
36
+
37
def set_startup_time() -> None:
    """Record the process start time (UTC) used by /health to report uptime."""
    global _startup_time
    _startup_time = datetime.now(timezone.utc)
41
+
42
+
43
@router.get(
    "/health",
    response_model=HealthResponse,
    status_code=status.HTTP_200_OK,
    summary="Health check",
    description="Basic health check endpoint",
)
async def health_check() -> HealthResponse:
    """
    Perform a basic health check.

    Returns:
        HealthResponse: Current health status; uptime is None until
        set_startup_time() has been called.
    """
    now = datetime.now(timezone.utc)
    elapsed = (now - _startup_time).total_seconds() if _startup_time else None

    return HealthResponse(
        status="healthy",
        timestamp=now.isoformat(),
        version=get_settings().app_version,
        uptime_seconds=elapsed,
    )
70
+
71
+
72
@router.get(
    "/ready",
    response_model=ReadyResponse,
    status_code=status.HTTP_200_OK,
    summary="Readiness check",
    description="Check if the application is ready to serve requests",
)
async def readiness_check() -> ReadyResponse:
    """
    Perform a readiness check.

    Each core component (memory manager, model router, tool registry) is
    probed independently; failures are recorded rather than raised.

    Returns:
        ReadyResponse: Readiness status with individual check results.
    """
    results: dict[str, bool] = {}
    diagnostics: dict[str, Any] = {}

    # Memory manager
    try:
        from app.main import get_memory_manager

        results["memory_manager"] = get_memory_manager() is not None
    except Exception as e:
        results["memory_manager"] = False
        diagnostics["memory_manager_error"] = str(e)

    # Model router (also reports the providers it can route to)
    try:
        from app.main import get_model_router

        model_router = get_model_router()
        results["model_router"] = model_router is not None
        if model_router:
            diagnostics["available_providers"] = model_router.list_providers()
    except Exception as e:
        results["model_router"] = False
        diagnostics["model_router_error"] = str(e)

    # Tool registry (also reports how many tools are registered)
    try:
        from app.main import get_tool_registry

        tool_registry = get_tool_registry()
        results["tool_registry"] = tool_registry is not None
        if tool_registry:
            diagnostics["registered_tools"] = len(tool_registry.list_tools())
    except Exception as e:
        results["tool_registry"] = False
        diagnostics["tool_registry_error"] = str(e)

    return ReadyResponse(
        ready=all(results.values()),
        checks=results,
        details=diagnostics or None,
    )
132
+
133
+
134
@router.get(
    "/ping",
    status_code=status.HTTP_200_OK,
    summary="Ping endpoint",
    description="Simple ping endpoint for load balancers",
)
async def ping() -> dict[str, str]:
    """Constant-payload liveness probe for load balancers."""
    return {"ping": "pong"}
backend/app/api/routes/memory.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Memory management endpoints."""
2
+
3
+ import logging
4
+ from datetime import datetime, timezone
5
+ from enum import Enum
6
+ from typing import Any
7
+ from uuid import uuid4
8
+
9
+ from fastapi import APIRouter, HTTPException, status
10
+ from pydantic import BaseModel, Field
11
+
12
+ router = APIRouter(prefix="/memory")
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
class MemoryType(str, Enum):
    """Types of memory layers (values are the wire-format strings)."""

    SHORT_TERM = "short_term"
    WORKING = "working"
    LONG_TERM = "long_term"
    SHARED = "shared"


class MemoryEntry(BaseModel):
    """A single memory entry as stored and returned by this API."""

    id: str
    memory_type: MemoryType
    content: dict[str, Any]
    metadata: dict[str, Any] = Field(default_factory=dict)
    timestamp: str  # ISO-8601 UTC; refreshed on update
    episode_id: str | None = None
    agent_id: str | None = None
    relevance_score: float | None = None  # populated only on query results
    embedding: list[float] | None = None  # reserved; not set by these routes

    
class MemoryQueryRequest(BaseModel):
    """Request for querying memory; defaults to searching every layer."""

    query: str
    memory_types: list[MemoryType] = Field(default_factory=lambda: list(MemoryType))
    episode_id: str | None = None
    limit: int = 10
    min_relevance: float = 0.0


class MemoryQueryResponse(BaseModel):
    """Response from memory query (entries are limited; total_found is not)."""

    entries: list[MemoryEntry]
    total_found: int
    query: str


class MemoryStoreRequest(BaseModel):
    """Request to store a memory entry; id and timestamp are server-assigned."""

    memory_type: MemoryType
    content: dict[str, Any]
    metadata: dict[str, Any] = Field(default_factory=dict)
    episode_id: str | None = None
    agent_id: str | None = None


class MemoryStats(BaseModel):
    """Statistics about memory usage."""

    short_term_count: int
    working_count: int
    long_term_count: int
    shared_count: int
    total_count: int
    oldest_entry: str | None = None
    newest_entry: str | None = None


# In-memory storage (would use actual memory layers in production).
# NOTE(review): process-local — cleared on restart, not shared across workers.
_memory_store: dict[str, MemoryEntry] = {}
81
+
82
+
83
@router.post(
    "/store",
    response_model=MemoryEntry,
    status_code=status.HTTP_201_CREATED,
    summary="Store memory entry",
    description="Store a new memory entry",
)
async def store_memory(request: MemoryStoreRequest) -> MemoryEntry:
    """
    Persist a new memory entry with a server-assigned ID and timestamp.

    Args:
        request: Memory storage request.

    Returns:
        MemoryEntry: Stored memory entry.
    """
    entry_id = str(uuid4())

    new_entry = MemoryEntry(
        id=entry_id,
        memory_type=request.memory_type,
        content=request.content,
        metadata=request.metadata,
        timestamp=datetime.now(timezone.utc).isoformat(),
        episode_id=request.episode_id,
        agent_id=request.agent_id,
    )
    _memory_store[entry_id] = new_entry

    logger.info(f"Stored memory entry {entry_id} ({request.memory_type})")
    return new_entry
117
+
118
+
119
@router.post(
    "/query",
    response_model=MemoryQueryResponse,
    status_code=status.HTTP_200_OK,
    summary="Query memory",
    description="Query memory entries by semantic similarity or filters",
)
async def query_memory(request: MemoryQueryRequest) -> MemoryQueryResponse:
    """
    Query memory entries.

    Scoring is done on copies so stored entries are never mutated with
    query-specific relevance scores (previously, stale scores leaked into
    the store and later GET /{entry_id} responses).

    Args:
        request: Memory query request.

    Returns:
        MemoryQueryResponse: Matching memory entries with relevance scores.
    """
    logger.info("Querying memory: '%s...'", request.query[:50])

    entries = list(_memory_store.values())

    # Filter by memory type
    if request.memory_types:
        entries = [e for e in entries if e.memory_type in request.memory_types]

    # Filter by episode
    if request.episode_id:
        entries = [e for e in entries if e.episode_id == request.episode_id]

    # Simple text matching (would use embeddings in production)
    query_lower = request.query.lower()
    scored_entries: list[MemoryEntry] = []
    for entry in entries:
        content_str = str(entry.content).lower()
        if query_lower not in content_str:
            continue
        raw_score = content_str.count(query_lower) / len(content_str.split())
        score = min(raw_score * 10, 1.0)
        if score >= request.min_relevance:
            # Score a copy so the stored entry stays pristine.
            scored_entries.append(entry.copy(update={"relevance_score": score}))

    # Sort by relevance and limit
    scored_entries.sort(key=lambda e: e.relevance_score or 0, reverse=True)
    result_entries = scored_entries[: request.limit]

    return MemoryQueryResponse(
        entries=result_entries,
        total_found=len(scored_entries),
        query=request.query,
    )
169
+
170
+
171
@router.get(
    "/{entry_id}",
    response_model=MemoryEntry,
    status_code=status.HTTP_200_OK,
    summary="Get memory entry",
    description="Get a specific memory entry by ID",
)
async def get_memory_entry(entry_id: str) -> MemoryEntry:
    """
    Fetch a single memory entry by its identifier.

    Args:
        entry_id: ID of the memory entry.

    Returns:
        MemoryEntry: The stored entry.
    """
    entry = _memory_store.get(entry_id)
    if entry is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Memory entry {entry_id} not found",
        )
    return entry
194
+
195
+
196
@router.put(
    "/{entry_id}",
    response_model=MemoryEntry,
    status_code=status.HTTP_200_OK,
    summary="Update memory entry",
    description="Update an existing memory entry",
)
async def update_memory_entry(
    entry_id: str,
    content: dict[str, Any],
    metadata: dict[str, Any] | None = None,
) -> MemoryEntry:
    """
    Replace an entry's content and optionally merge new metadata.

    Args:
        entry_id: ID of the entry to update.
        content: New content (replaces the old content entirely).
        metadata: Optional metadata merged into the existing metadata.

    Returns:
        MemoryEntry: The updated entry with a refreshed timestamp.
    """
    stored = _memory_store.get(entry_id)
    if stored is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Memory entry {entry_id} not found",
        )

    stored.content = content
    if metadata:
        # Merge rather than replace so existing metadata keys survive.
        stored.metadata.update(metadata)
    stored.timestamp = datetime.now(timezone.utc).isoformat()

    logger.info(f"Updated memory entry {entry_id}")
    return stored
233
+
234
+
235
@router.delete(
    "/{entry_id}",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Delete memory entry",
    description="Delete a memory entry",
)
async def delete_memory_entry(entry_id: str) -> None:
    """
    Remove a memory entry from the store.

    Args:
        entry_id: ID of the entry to delete.
    """
    # pop() removes and reports absence in one dictionary operation.
    removed = _memory_store.pop(entry_id, None)
    if removed is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Memory entry {entry_id} not found",
        )
    logger.info(f"Deleted memory entry {entry_id}")
256
+
257
+
258
@router.get(
    "/stats/overview",
    response_model=MemoryStats,
    status_code=status.HTTP_200_OK,
    summary="Get memory stats",
    description="Get statistics about memory usage",
)
async def get_memory_stats() -> MemoryStats:
    """
    Summarize the in-memory store: per-layer counts and timestamp range.

    Returns:
        MemoryStats: Memory usage statistics.
    """
    all_entries = list(_memory_store.values())

    # Tally entries per memory layer in a single pass.
    per_layer = dict.fromkeys(MemoryType, 0)
    for item in all_entries:
        per_layer[item.memory_type] += 1

    timestamps = [item.timestamp for item in all_entries]
    oldest = min(timestamps) if timestamps else None
    newest = max(timestamps) if timestamps else None

    return MemoryStats(
        short_term_count=per_layer[MemoryType.SHORT_TERM],
        working_count=per_layer[MemoryType.WORKING],
        long_term_count=per_layer[MemoryType.LONG_TERM],
        shared_count=per_layer[MemoryType.SHARED],
        total_count=len(all_entries),
        oldest_entry=oldest,
        newest_entry=newest,
    )
289
+
290
+
291
@router.delete(
    "/clear/{memory_type}",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Clear memory layer",
    description="Clear all entries from a memory layer",
)
async def clear_memory_layer(memory_type: MemoryType) -> None:
    """
    Clear all entries from a memory layer.

    Args:
        memory_type: Type of memory to clear.
    """
    # NOTE: no `global` statement needed — the dict is mutated in place,
    # never rebound; the original `global _memory_store` was misleading.
    # Snapshot keys first: deleting while iterating the dict would raise.
    to_delete = [k for k, v in _memory_store.items() if v.memory_type == memory_type]
    for key in to_delete:
        del _memory_store[key]
    logger.info(f"Cleared {len(to_delete)} entries from {memory_type}")
309
+
310
+
311
@router.post(
    "/consolidate",
    status_code=status.HTTP_200_OK,
    summary="Consolidate memory",
    description="Consolidate short-term memory into long-term memory",
)
async def consolidate_memory(episode_id: str | None = None) -> dict[str, Any]:
    """
    Promote short-term entries to long-term memory.

    Args:
        episode_id: Optional episode to restrict consolidation to.

    Returns:
        Dict with the consolidated entry count and the episode id.
    """
    # Select short-term entries in scope (optionally one episode only).
    candidates = [
        item
        for item in _memory_store.values()
        if item.memory_type == MemoryType.SHORT_TERM
        and (not episode_id or item.episode_id == episode_id)
    ]

    for item in candidates:
        item.memory_type = MemoryType.LONG_TERM

    logger.info(f"Consolidated {len(candidates)} entries to long-term memory")

    return {
        "consolidated_count": len(candidates),
        "episode_id": episode_id,
    }
344
+ }
backend/app/api/routes/tasks.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tasks management endpoints."""
2
+
3
+ import logging
4
+ from enum import Enum
5
+ from typing import Any
6
+
7
+ from fastapi import APIRouter, HTTPException, status
8
+ from pydantic import BaseModel, Field
9
+
10
+ router = APIRouter(prefix="/tasks")
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class TaskDifficulty(str, Enum):
    """Task difficulty levels.

    Inherits ``str`` so members serialize directly as their string values
    in JSON responses and path/query parameters.
    """

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"
    EXPERT = "expert"
+
22
+
23
class TaskType(str, Enum):
    """Types of scraping tasks.

    Inherits ``str`` so members serialize directly as their string values
    in JSON responses and path/query parameters.
    """

    SINGLE_PAGE = "single_page"
    MULTI_PAGE = "multi_page"
    SEARCH_EXTRACT = "search_extract"
    FORM_FILL = "form_fill"
    DYNAMIC_CONTENT = "dynamic_content"
    AUTHENTICATION = "authentication"
+
33
+
34
class FieldSchema(BaseModel):
    """Schema for a field to extract.

    Describes one datum a task requires the agent to pull from a page.
    """

    name: str  # Machine-readable field identifier.
    description: str  # Human-readable hint about what to extract.
    field_type: str = "string"  # Logical type, e.g. "string", "number", "array".
    required: bool = True  # Whether the extraction must include this field.
    validation_pattern: str | None = None  # Optional regex the value must match.
+
43
+
44
class Task(BaseModel):
    """A scraping task definition.

    Either ``target_url`` (a concrete page) or ``target_domain`` (a site the
    agent must search within) locates the target; both are optional.
    """

    id: str  # Unique task identifier (repository key).
    name: str  # Short display name.
    description: str  # What the agent must accomplish.
    task_type: TaskType
    difficulty: TaskDifficulty
    target_url: str | None = None  # Concrete page to scrape, if known.
    target_domain: str | None = None  # Domain scope when no exact URL is given.
    fields_to_extract: list[FieldSchema]  # Data the agent must return.
    success_criteria: dict[str, Any]  # Free-form pass/fail thresholds.
    hints: list[str] = Field(default_factory=list)  # Optional guidance strings.
    max_steps: int = 50  # Step budget for an episode on this task.
    timeout_seconds: float = 300.0  # Wall-clock budget.
    tags: list[str] = Field(default_factory=list)  # Labels for filtering.
+
61
+
62
class TaskListResponse(BaseModel):
    """Response for listing tasks (one page of results)."""

    tasks: list[Task]  # Tasks on the requested page.
    total: int  # Total matches across all pages (after filtering).
    page: int  # 1-indexed page number echoed back.
    page_size: int  # Requested page size echoed back.
+
70
+
71
class TaskProgress(BaseModel):
    """Progress on a task within an episode."""

    task_id: str  # Task being worked on.
    fields_extracted: int  # Fields extracted so far.
    fields_total: int  # Total fields the task requires.
    steps_taken: int  # Steps consumed in the episode.
    max_steps: int  # Step budget for the task.
    accuracy_estimate: float  # Estimated extraction accuracy (0..1 presumably — confirm with producer).
    completion_percentage: float  # Overall completion estimate.
+
82
+
83
# Sample task repository (would be database-backed in production).
# Keys duplicate each Task.id so lookups by id are O(1).
TASK_REPOSITORY: dict[str, Task] = {
    # Easy: single static page, three fields.
    "task_001": Task(
        id="task_001",
        name="Extract Product Details",
        description="Extract product name, price, and description from an e-commerce page",
        task_type=TaskType.SINGLE_PAGE,
        difficulty=TaskDifficulty.EASY,
        target_url="https://example.com/product/123",
        fields_to_extract=[
            FieldSchema(name="product_name", description="The name of the product"),
            FieldSchema(name="price", description="Current price", field_type="number"),
            FieldSchema(name="description", description="Product description"),
        ],
        success_criteria={"min_accuracy": 0.9, "required_fields": ["product_name", "price"]},
        hints=["Look for h1 tags for product name", "Price often in span with class containing 'price'"],
        tags=["ecommerce", "product"],
    ),
    # Medium: requires a search step before extraction; domain-scoped.
    "task_002": Task(
        id="task_002",
        name="Search and Extract Company Info",
        description="Search for a company and extract key information from search results",
        task_type=TaskType.SEARCH_EXTRACT,
        difficulty=TaskDifficulty.MEDIUM,
        target_domain="linkedin.com",
        fields_to_extract=[
            FieldSchema(name="company_name", description="Official company name"),
            FieldSchema(name="industry", description="Primary industry"),
            FieldSchema(name="employee_count", description="Number of employees", field_type="string"),
            FieldSchema(name="headquarters", description="Location of headquarters"),
        ],
        success_criteria={"min_accuracy": 0.8, "required_fields": ["company_name", "industry"]},
        tags=["search", "company", "linkedin"],
        max_steps=30,
    ),
    # Hard: pagination across many pages; larger step budget.
    "task_003": Task(
        id="task_003",
        name="Multi-page Article Extraction",
        description="Navigate through paginated articles and extract all content",
        task_type=TaskType.MULTI_PAGE,
        difficulty=TaskDifficulty.HARD,
        target_domain="news-site.example.com",
        fields_to_extract=[
            FieldSchema(name="articles", description="List of article data", field_type="array"),
        ],
        success_criteria={"min_articles": 10, "min_accuracy": 0.85},
        tags=["pagination", "articles", "news"],
        max_steps=100,
    ),
}
133
+
134
+
135
@router.get(
    "/",
    response_model=TaskListResponse,
    status_code=status.HTTP_200_OK,
    summary="List available tasks",
    description="Get a paginated list of available scraping tasks",
)
async def list_tasks(
    page: int = 1,
    page_size: int = 20,
    difficulty: TaskDifficulty | None = None,
    task_type: TaskType | None = None,
    tag: str | None = None,
) -> TaskListResponse:
    """
    List available tasks with optional filtering.

    Args:
        page: Page number (1-indexed); values below 1 are clamped to 1.
        page_size: Number of tasks per page; values below 1 are clamped to 1.
        difficulty: Filter by difficulty level.
        task_type: Filter by task type.
        tag: Filter by tag.

    Returns:
        TaskListResponse: Paginated list of tasks.
    """
    # Clamp pagination inputs: page=0 or a negative page would produce a
    # negative slice start and silently return items from the wrong end.
    page = max(page, 1)
    page_size = max(page_size, 1)

    tasks = list(TASK_REPOSITORY.values())

    # Apply filters
    if difficulty:
        tasks = [t for t in tasks if t.difficulty == difficulty]
    if task_type:
        tasks = [t for t in tasks if t.task_type == task_type]
    if tag:
        tasks = [t for t in tasks if tag in t.tags]

    # Paginate
    total = len(tasks)
    start = (page - 1) * page_size
    end = start + page_size
    paginated_tasks = tasks[start:end]

    return TaskListResponse(
        tasks=paginated_tasks,
        total=total,
        page=page,
        page_size=page_size,
    )
+ )
184
+
185
+
186
@router.get(
    "/{task_id}",
    response_model=Task,
    status_code=status.HTTP_200_OK,
    summary="Get task details",
    description="Get details of a specific task",
)
async def get_task(task_id: str) -> Task:
    """
    Look up one task by its identifier.

    Args:
        task_id: ID of the task.

    Returns:
        Task: The stored task definition.
    """
    task = TASK_REPOSITORY.get(task_id)
    if task is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Task {task_id} not found",
        )
    return task
209
+
210
+
211
@router.post(
    "/",
    response_model=Task,
    status_code=status.HTTP_201_CREATED,
    summary="Create a new task",
    description="Create a new scraping task",
)
async def create_task(task: Task) -> Task:
    """
    Register a new task in the repository.

    Args:
        task: Task definition (its id must be unused).

    Returns:
        Task: The task as stored.
    """
    # Duplicate ids are a client error: report conflict, don't overwrite.
    if task.id in TASK_REPOSITORY:
        raise HTTPException(
            status_code=status.HTTP_409_CONFLICT,
            detail=f"Task {task.id} already exists",
        )

    TASK_REPOSITORY[task.id] = task
    logger.info(f"Created task {task.id}: {task.name}")
    return task
236
+
237
+
238
@router.get(
    "/types/",
    status_code=status.HTTP_200_OK,
    summary="Get task types",
    description="Get all available task types",
)
async def get_task_types() -> dict[str, list[str]]:
    """
    Enumerate the supported task types and difficulty levels.

    Returns:
        Dict mapping "task_types" and "difficulties" to their string values.
    """
    type_values = [member.value for member in TaskType]
    difficulty_values = [member.value for member in TaskDifficulty]
    return {"task_types": type_values, "difficulties": difficulty_values}
+ }
backend/app/api/routes/tools.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tool registry and testing endpoints."""
2
+
3
+ import logging
4
+ from typing import Any
5
+
6
+ from fastapi import APIRouter, HTTPException, status
7
+ from pydantic import BaseModel, Field
8
+
9
+ router = APIRouter(prefix="/tools")
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class ToolParameter(BaseModel):
    """Parameter definition for a tool."""

    name: str  # Parameter name as passed in the call payload.
    type: str  # Logical type, e.g. "string", "boolean", "integer".
    description: str  # Human-readable purpose of the parameter.
    required: bool = True  # Whether callers must supply it.
    default: Any | None = None  # Default used when omitted (if not required).
21
+
22
+
23
class ToolDefinition(BaseModel):
    """Definition of a tool in the registry."""

    name: str  # Unique tool name used for lookup and execution.
    description: str  # What the tool does.
    category: str  # Grouping key, e.g. "browser", "extraction", "search".
    parameters: list[ToolParameter]  # Accepted call parameters.
    returns: str  # Human-readable description of the return value.
    examples: list[dict[str, Any]] = Field(default_factory=list)  # Sample calls.
    requires_browser: bool = False  # True if a live browser session is needed.
    cost_estimate: float = 0.0  # Relative cost per invocation (unit unspecified — confirm with registry owner).
34
+
35
+
36
class ToolRegistryResponse(BaseModel):
    """Response containing the tool registry."""

    tools: list[ToolDefinition]  # Tools matching the request (possibly filtered).
    categories: list[str]  # All known categories, regardless of filter.
    total_count: int  # Number of tools in `tools`.
42
+
43
+
44
class ToolTestRequest(BaseModel):
    """Request to test a tool."""

    tool_name: str  # Name of the registered tool to invoke.
    parameters: dict[str, Any] = Field(default_factory=dict)  # Call arguments.
    dry_run: bool = True  # When True, validate only and return a mock result.
50
+
51
+
52
class ToolTestResponse(BaseModel):
    """Response from tool testing."""

    tool_name: str  # Tool that was tested.
    success: bool  # False when validation or execution raised.
    result: Any | None = None  # Tool output (or mock payload for dry runs).
    error: str | None = None  # Stringified exception when success is False.
    execution_time_ms: float = 0.0  # Wall-clock duration of the test.
    dry_run: bool  # Echo of the request's dry_run flag.
61
+
62
+
63
# Tool definitions (would be dynamically registered in production).
# Grouped by category: browser interaction, content extraction, and search.
TOOL_DEFINITIONS: list[ToolDefinition] = [
    # --- Browser navigation / interaction ---
    ToolDefinition(
        name="navigate_to",
        description="Navigate the browser to a specified URL",
        category="browser",
        parameters=[
            ToolParameter(name="url", type="string", description="URL to navigate to"),
            ToolParameter(name="wait_for", type="string", description="CSS selector to wait for", required=False),
        ],
        returns="NavigationResult with page info",
        requires_browser=True,
        cost_estimate=0.01,
    ),
    ToolDefinition(
        name="click_element",
        description="Click on an element identified by selector",
        category="browser",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector of element to click"),
        ],
        returns="ClickResult with success status",
        requires_browser=True,
        cost_estimate=0.005,
    ),
    # --- Content extraction ---
    ToolDefinition(
        name="extract_text",
        description="Extract text content from elements",
        category="extraction",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector to extract from"),
            ToolParameter(name="multiple", type="boolean", description="Extract from all matches", default=False),
        ],
        returns="Extracted text or list of texts",
        requires_browser=True,
        cost_estimate=0.002,
    ),
    ToolDefinition(
        name="extract_attribute",
        description="Extract attribute value from element",
        category="extraction",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector"),
            ToolParameter(name="attribute", type="string", description="Attribute name to extract"),
        ],
        returns="Attribute value",
        requires_browser=True,
        cost_estimate=0.002,
    ),
    # --- Search (no browser required) ---
    ToolDefinition(
        name="search_engine",
        description="Perform a search using a search engine",
        category="search",
        parameters=[
            ToolParameter(name="query", type="string", description="Search query"),
            ToolParameter(name="engine", type="string", description="Search engine", default="google"),
            ToolParameter(name="num_results", type="integer", description="Number of results", default=10),
        ],
        returns="List of search results",
        cost_estimate=0.05,
    ),
    ToolDefinition(
        name="fill_form",
        description="Fill a form field with a value",
        category="browser",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector of form field"),
            ToolParameter(name="value", type="string", description="Value to fill"),
        ],
        returns="FillResult with success status",
        requires_browser=True,
        cost_estimate=0.005,
    ),
    ToolDefinition(
        name="screenshot",
        description="Take a screenshot of the current page",
        category="browser",
        parameters=[
            ToolParameter(name="full_page", type="boolean", description="Capture full page", default=False),
        ],
        returns="Base64 encoded screenshot",
        requires_browser=True,
        cost_estimate=0.01,
    ),
    ToolDefinition(
        name="get_page_html",
        description="Get the full HTML content of the current page",
        category="extraction",
        parameters=[],
        returns="HTML string",
        requires_browser=True,
        cost_estimate=0.001,
    ),
    ToolDefinition(
        name="wait_for_selector",
        description="Wait for an element to appear on the page",
        category="browser",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector to wait for"),
            ToolParameter(name="timeout_ms", type="integer", description="Timeout in milliseconds", default=30000),
        ],
        returns="Boolean indicating if element appeared",
        requires_browser=True,
        cost_estimate=0.001,
    ),
    ToolDefinition(
        name="scroll_to",
        description="Scroll to a position or element",
        category="browser",
        parameters=[
            ToolParameter(name="selector", type="string", description="CSS selector", required=False),
            ToolParameter(name="position", type="string", description="Position: top, bottom, or pixel value", required=False),
        ],
        returns="ScrollResult",
        requires_browser=True,
        cost_estimate=0.001,
    ),
]
181
+
182
+
183
@router.get(
    "/registry",
    response_model=ToolRegistryResponse,
    status_code=status.HTTP_200_OK,
    summary="Get tool registry",
    description="Get all available tools in the registry",
)
async def get_tool_registry(category: str | None = None) -> ToolRegistryResponse:
    """
    Get the tool registry with all available tools.

    Args:
        category: Optional filter by category.

    Returns:
        ToolRegistryResponse: Tools (filtered if a category was given) plus
        the full, sorted category list.
    """
    tools = TOOL_DEFINITIONS
    if category:
        tools = [t for t in tools if t.category == category]

    # Sort for a deterministic order: `list(set(...))` made the response's
    # category order vary between processes/runs.
    categories = sorted({t.category for t in TOOL_DEFINITIONS})

    return ToolRegistryResponse(
        tools=tools,
        categories=categories,
        total_count=len(tools),
    )
211
+
212
+
213
@router.get(
    "/registry/{tool_name}",
    response_model=ToolDefinition,
    status_code=status.HTTP_200_OK,
    summary="Get tool details",
    description="Get details of a specific tool",
)
async def get_tool_details(tool_name: str) -> ToolDefinition:
    """
    Look up a single tool definition by name.

    Args:
        tool_name: Name of the tool.

    Returns:
        ToolDefinition: The matching tool.
    """
    # Linear scan is fine: the registry is a small static list.
    match = next((t for t in TOOL_DEFINITIONS if t.name == tool_name), None)
    if match is not None:
        return match
    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail=f"Tool '{tool_name}' not found",
    )
237
+
238
+
239
@router.post(
    "/test",
    response_model=ToolTestResponse,
    status_code=status.HTTP_200_OK,
    summary="Test a tool",
    description="Test a tool with provided parameters",
)
async def test_tool(request: ToolTestRequest) -> ToolTestResponse:
    """
    Test a tool execution.

    Validates required parameters, then either returns a mock payload
    (dry run) or delegates to the MCP tool registry for real execution.
    Validation/execution failures are reported in the response body
    (success=False); only an unknown tool name yields an HTTP 404.

    Args:
        request: Tool test request.

    Returns:
        ToolTestResponse: Result of tool test.
    """
    import time

    start_time = time.time()
    logger.info(f"Testing tool '{request.tool_name}' with dry_run={request.dry_run}")

    # Find the tool definition by name (linear scan over the static registry).
    tool = None
    for t in TOOL_DEFINITIONS:
        if t.name == request.tool_name:
            tool = t
            break

    if not tool:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Tool '{request.tool_name}' not found",
        )

    try:
        # Validate required parameters before attempting any execution.
        for param in tool.parameters:
            if param.required and param.name not in request.parameters:
                raise ValueError(f"Missing required parameter: {param.name}")

        if request.dry_run:
            # Return mock result for dry run — nothing is actually executed.
            result = {
                "status": "dry_run",
                "tool": request.tool_name,
                "parameters": request.parameters,
                "would_require_browser": tool.requires_browser,
            }
        else:
            # Actually execute the tool (placeholder).
            # Imported lazily so this module loads without the registry package.
            from app.tools.registry import MCPToolRegistry
            registry = MCPToolRegistry()
            result = await registry.execute_tool(request.tool_name, request.parameters)

        execution_time = (time.time() - start_time) * 1000  # milliseconds

        return ToolTestResponse(
            tool_name=request.tool_name,
            success=True,
            result=result,
            execution_time_ms=execution_time,
            dry_run=request.dry_run,
        )
    except Exception as e:
        # Any validation or execution failure becomes a success=False payload
        # so clients always receive a structured ToolTestResponse.
        execution_time = (time.time() - start_time) * 1000
        logger.error(f"Tool test failed: {e}")
        return ToolTestResponse(
            tool_name=request.tool_name,
            success=False,
            error=str(e),
            execution_time_ms=execution_time,
            dry_run=request.dry_run,
        )
313
+
314
+
315
@router.get(
    "/categories",
    status_code=status.HTTP_200_OK,
    summary="Get tool categories",
    description="Get all tool categories",
)
async def get_categories() -> dict[str, list[str]]:
    """
    Get all tool categories mapped to the tool names they contain.

    Returns:
        Dict with a "categories" key mapping category -> list of tool names.
    """
    categories: dict[str, list[str]] = {}
    for tool in TOOL_DEFINITIONS:
        # setdefault replaces the manual "key missing" branch.
        categories.setdefault(tool.category, []).append(tool.name)
    return {"categories": categories}
backend/app/utils/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utility modules for ScrapeRL backend."""
2
+
3
+ from app.utils.html import (
4
+ parse_html,
5
+ clean_html,
6
+ extract_text,
7
+ semantic_chunk,
8
+ extract_links,
9
+ extract_tables,
10
+ )
11
+ from app.utils.logging import setup_logging, get_logger
12
+
13
+ __all__ = [
14
+ "parse_html",
15
+ "clean_html",
16
+ "extract_text",
17
+ "semantic_chunk",
18
+ "extract_links",
19
+ "extract_tables",
20
+ "setup_logging",
21
+ "get_logger",
22
+ ]
backend/app/utils/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (492 Bytes). View file
 
backend/app/utils/__pycache__/html.cpython-314.pyc ADDED
Binary file (10.7 kB). View file
 
backend/app/utils/__pycache__/logging.cpython-314.pyc ADDED
Binary file (2.54 kB). View file
 
backend/app/utils/html.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HTML processing utilities for ScrapeRL backend."""
2
+
3
+ import re
4
+ from typing import Any, Optional
5
+ from bs4 import BeautifulSoup, Tag, NavigableString
6
+
7
+ from app.utils.logging import get_logger
8
+
9
+ logger = get_logger(__name__)
10
+
11
+
12
def parse_html(html: str, parser: str = "html.parser") -> BeautifulSoup:
    """
    Parse HTML string into a BeautifulSoup object.

    Thin wrapper so every utility in this module parses the same way.

    Args:
        html: Raw HTML string
        parser: Parser to use (html.parser, lxml, html5lib) — lxml/html5lib
            require those packages to be installed; verify before using.

    Returns:
        Parsed BeautifulSoup object
    """
    return BeautifulSoup(html, parser)
24
+
25
+
26
def clean_html(
    html: str,
    remove_scripts: bool = True,
    remove_styles: bool = True,
    remove_comments: bool = True,
    remove_tags: Optional[list[str]] = None,
) -> str:
    """
    Strip unwanted elements out of an HTML document.

    Args:
        html: Raw HTML string
        remove_scripts: Drop <script> elements
        remove_styles: Drop <style> elements
        remove_comments: Drop HTML comments
        remove_tags: Extra tag names to drop entirely

    Returns:
        The cleaned HTML serialized back to a string
    """
    soup = parse_html(html)

    # Collect tag names to strip, then remove them with a shared loop.
    doomed_tags: list[str] = []
    if remove_scripts:
        doomed_tags.append("script")
    if remove_styles:
        doomed_tags.append("style")
    for tag_name in doomed_tags:
        for node in soup.find_all(tag_name):
            node.decompose()

    if remove_comments:
        from bs4 import Comment

        for node in soup.find_all(string=lambda text: isinstance(text, Comment)):
            node.extract()

    # Remove any caller-specified tags as well.
    for tag_name in remove_tags or []:
        for node in soup.find_all(tag_name):
            node.decompose()

    return str(soup)
72
+
73
+
74
def extract_text(
    html: str,
    separator: str = " ",
    strip: bool = True,
) -> str:
    """
    Strip markup from HTML and return only its text content.

    Args:
        html: Raw HTML string
        separator: Joiner placed between text segments
        strip: Collapse whitespace runs and trim the ends

    Returns:
        Extracted plain text
    """
    soup = parse_html(html)

    # Drop elements whose contents are never visible text.
    for hidden in soup(["script", "style", "noscript"]):
        hidden.decompose()

    text = soup.get_text(separator=separator)
    if not strip:
        return text

    # Collapse all whitespace runs into single spaces and trim.
    return re.sub(r"\s+", " ", text).strip()
103
+
104
+
105
def semantic_chunk(
    html: str,
    max_chunk_size: int = 4000,
    overlap: int = 200,
) -> list[dict[str, Any]]:
    """
    Split HTML content into semantic chunks based on structure.

    Walks structural elements (articles, sections, paragraphs, headings) in
    document order and accumulates their text into chunks of at most
    ``max_chunk_size`` characters, carrying ``overlap`` trailing characters
    into the next chunk for continuity. Falls back to fixed-size slicing of
    the plain text when no structural elements are found.

    Args:
        html: Raw HTML string
        max_chunk_size: Maximum characters per chunk
        overlap: Number of characters to overlap between chunks

    Returns:
        List of chunk dictionaries with text and metadata
        ("tags": element names contributing to the chunk,
        "headings": heading texts seen, "char_count": chunk length)
    """
    soup = parse_html(html)
    chunks: list[dict[str, Any]] = []

    # Remove non-content elements
    for element in soup(["script", "style", "noscript", "nav", "footer", "header"]):
        element.decompose()

    # Find semantic boundaries
    # NOTE(review): nested matches (e.g. a <p> inside a matched <div>) are
    # visited separately by find_all, so text may repeat across elements —
    # confirm this duplication is intended.
    semantic_tags = ["article", "section", "div", "p", "h1", "h2", "h3", "h4", "h5", "h6"]

    def get_text_content(element: Tag | NavigableString) -> str:
        # Normalize either node kind to its visible text.
        if isinstance(element, NavigableString):
            return str(element).strip()
        return element.get_text(separator=" ", strip=True)

    current_chunk = ""
    current_metadata: dict[str, Any] = {"tags": [], "headings": []}

    for element in soup.find_all(semantic_tags):
        text = get_text_content(element)
        if not text:
            continue

        tag_name = element.name if isinstance(element, Tag) else "text"

        # Check if adding this would exceed max size
        if len(current_chunk) + len(text) + 1 > max_chunk_size:
            if current_chunk:
                chunks.append({
                    "text": current_chunk.strip(),
                    "metadata": current_metadata.copy(),
                    "char_count": len(current_chunk),
                })
            # Start new chunk with overlap (tail of the previous chunk).
            if overlap > 0 and current_chunk:
                current_chunk = current_chunk[-overlap:] + " " + text
            else:
                current_chunk = text
            current_metadata = {"tags": [tag_name], "headings": []}
        else:
            current_chunk += " " + text if current_chunk else text
            current_metadata["tags"].append(tag_name)

        # Track headings (truncated to 100 chars) for chunk context.
        if tag_name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            current_metadata["headings"].append(text[:100])

    # Add remaining content
    if current_chunk.strip():
        chunks.append({
            "text": current_chunk.strip(),
            "metadata": current_metadata,
            "char_count": len(current_chunk),
        })

    # If no semantic chunks found, fall back to simple fixed-size chunking
    # over the flattened plain text.
    if not chunks:
        text = extract_text(html)
        for i in range(0, len(text), max_chunk_size - overlap):
            chunk_text = text[i : i + max_chunk_size]
            if chunk_text.strip():
                chunks.append({
                    "text": chunk_text.strip(),
                    "metadata": {"tags": [], "headings": []},
                    "char_count": len(chunk_text),
                })

    return chunks
189
+
190
+
191
def extract_links(
    html: str,
    base_url: Optional[str] = None,
    include_text: bool = True,
) -> list[dict[str, str]]:
    """
    Collect hyperlinks from an HTML document.

    Args:
        html: Raw HTML string
        base_url: If given, relative hrefs are resolved against it
        include_text: Whether to include each anchor's visible text

    Returns:
        One dict per link with "href" and optionally "text"/"title"
    """
    from urllib.parse import urljoin

    soup = parse_html(html)
    results: list[dict[str, str]] = []

    for anchor in soup.find_all("a", href=True):
        href = anchor.get("href", "")
        # Skip empty hrefs, in-page fragments, and javascript: pseudo-links.
        if not href or href.startswith("#") or href.startswith("javascript:"):
            continue

        # Resolve relative URLs against the base, leaving absolute ones alone.
        is_absolute = href.startswith(("http://", "https://", "//"))
        if base_url and not is_absolute:
            href = urljoin(base_url, href)

        item: dict[str, str] = {"href": href}

        if include_text:
            item["text"] = anchor.get_text(strip=True)

        # Include title if present
        title = anchor.get("title")
        if title:
            item["title"] = title

        results.append(item)

    return results
234
+
235
+
236
def extract_tables(
    html: str,
    include_headers: bool = True,
) -> list[dict[str, Any]]:
    """
    Extract tables from HTML as structured data.

    Headers are taken from <thead> when present; otherwise the first body
    row containing <th> cells is promoted to the header list. Tables with
    neither rows nor headers are skipped.

    Args:
        html: Raw HTML string
        include_headers: Try to identify and include header rows

    Returns:
        List of table dictionaries with "headers" (list of strings) and
        "rows" (list of lists of cell strings)
    """
    soup = parse_html(html)
    tables: list[dict[str, Any]] = []

    for table in soup.find_all("table"):
        table_data: dict[str, Any] = {
            "headers": [],
            "rows": [],
        }

        # Extract headers from thead or first row
        if include_headers:
            thead = table.find("thead")
            if thead:
                header_row = thead.find("tr")
                if header_row:
                    # Accept td cells too: real-world theads often misuse td.
                    table_data["headers"] = [
                        th.get_text(strip=True)
                        for th in header_row.find_all(["th", "td"])
                    ]

        # Extract body rows; fall back to the table itself when no <tbody>.
        tbody = table.find("tbody") or table
        for row in tbody.find_all("tr"):
            cells = row.find_all(["td", "th"])
            row_data = [cell.get_text(strip=True) for cell in cells]

            # If no headers yet and this looks like a header row
            if include_headers and not table_data["headers"] and row.find("th"):
                table_data["headers"] = row_data
            else:
                if row_data:  # Skip empty rows
                    table_data["rows"].append(row_data)

        # Keep only tables that produced some content.
        if table_data["rows"] or table_data["headers"]:
            tables.append(table_data)

    return tables
backend/app/utils/logging.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Logging utilities for ScrapeRL backend."""
2
+
3
+ import logging
4
+ import sys
5
+ from typing import Optional
6
+
7
+
8
def setup_logging(
    level: str = "INFO",
    format_string: Optional[str] = None,
    log_file: Optional[str] = None,
) -> None:
    """
    Configure logging for the application.

    Args:
        level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL);
            unknown names fall back to INFO.
        format_string: Custom format string for log messages
        log_file: Optional file path to write logs to (UTF-8)
    """
    if format_string is None:
        format_string = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

    # Unknown level names silently fall back to INFO rather than raising.
    log_level = getattr(logging, level.upper(), logging.INFO)

    handlers: list[logging.Handler] = [logging.StreamHandler(sys.stdout)]

    if log_file:
        # Pin the encoding: without it FileHandler uses the platform default,
        # which mangles non-ASCII log messages on some systems.
        file_handler = logging.FileHandler(log_file, encoding="utf-8")
        handlers.append(file_handler)

    # NOTE: basicConfig is a no-op if the root logger already has handlers.
    logging.basicConfig(
        level=log_level,
        format=format_string,
        handlers=handlers,
    )

    # Reduce noise from chatty third-party HTTP libraries.
    logging.getLogger("httpx").setLevel(logging.WARNING)
    logging.getLogger("httpcore").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
42
+
43
+
44
def get_logger(name: str) -> logging.Logger:
    """
    Return the logger registered under *name*.

    Thin convenience wrapper around ``logging.getLogger`` so application
    modules import one logging entry point from this package.

    Args:
        name: Logger name, typically __name__ of the calling module

    Returns:
        Configured logger instance
    """
    logger_instance = logging.getLogger(name)
    return logger_instance