Spaces:

NeerajCodz
/

scrapeRL

Running

App Files Files Community

NeerajCodz committed 27 days ago

Commit

ab65628

1 Parent(s): ff3e1be

feat: add core RL environment models (observation, action, reward, env)

Browse files

Files changed (12) hide show

backend/app/core/__init__.py +18 -0
backend/app/core/__pycache__/__init__.cpython-314.pyc +0 -0
backend/app/core/__pycache__/action.cpython-314.pyc +0 -0
backend/app/core/__pycache__/env.cpython-314.pyc +0 -0
backend/app/core/__pycache__/episode.cpython-314.pyc +0 -0
backend/app/core/__pycache__/observation.cpython-314.pyc +0 -0
backend/app/core/__pycache__/reward.cpython-314.pyc +0 -0
backend/app/core/action.py +347 -0
backend/app/core/env.py +497 -0
backend/app/core/episode.py +261 -0
backend/app/core/observation.py +228 -0
backend/app/core/reward.py +398 -0

backend/app/core/__init__.py ADDED Viewed

	@@ -0,0 +1,18 @@

+"""Core module - RL environment, observations, actions, and rewards."""
+from app.core.action import Action, ActionType
+from app.core.env import WebScraperEnv
+from app.core.episode import Episode, EpisodeStatus
+from app.core.observation import Observation
+from app.core.reward import RewardEngine, RewardBreakdown
+__all__ = [
+    "Action",
+    "ActionType",
+    "WebScraperEnv",
+    "Episode",
+    "EpisodeStatus",
+    "Observation",
+    "RewardEngine",
+    "RewardBreakdown",
+]

backend/app/core/__pycache__/__init__.cpython-314.pyc ADDED Viewed

Binary file (627 Bytes). View file

backend/app/core/__pycache__/action.cpython-314.pyc ADDED Viewed

Binary file (20.3 kB). View file

backend/app/core/__pycache__/env.cpython-314.pyc ADDED Viewed

Binary file (22.6 kB). View file

backend/app/core/__pycache__/episode.cpython-314.pyc ADDED Viewed

Binary file (16.1 kB). View file

backend/app/core/__pycache__/observation.cpython-314.pyc ADDED Viewed

Binary file (12.9 kB). View file

backend/app/core/__pycache__/reward.cpython-314.pyc ADDED Viewed

Binary file (19.3 kB). View file

backend/app/core/action.py ADDED Viewed

	@@ -0,0 +1,347 @@

+"""Action model for the RL environment."""
+from enum import Enum
+from typing import Any
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+class ActionType(str, Enum):
+    """All possible action types in the environment.
+    Members subclass ``str``, so each member compares equal to its string
+    value (for example ``ActionType.NAVIGATE == "navigate"``).
+    """
+    # Navigation actions
+    NAVIGATE = "navigate"
+    GO_BACK = "go_back"
+    GO_FORWARD = "go_forward"
+    REFRESH = "refresh"
+    # Interaction actions
+    CLICK = "click"
+    FILL = "fill"
+    SELECT = "select"
+    SCROLL = "scroll"
+    HOVER = "hover"
+    # Extraction actions
+    EXTRACT_FIELD = "extract_field"
+    EXTRACT_TABLE = "extract_table"
+    EXTRACT_LIST = "extract_list"
+    # Search actions
+    SEARCH_PAGE = "search_page"
+    SEARCH_ENGINE = "search_engine"
+    # Verification actions
+    VERIFY_FACT = "verify_fact"
+    VERIFY_FIELD = "verify_field"
+    # Memory actions
+    STORE_MEMORY = "store_memory"
+    RECALL_MEMORY = "recall_memory"
+    # Tool actions
+    MCP_TOOL_CALL = "mcp_tool_call"
+    # Planning actions
+    CREATE_PLAN = "create_plan"
+    UPDATE_PLAN = "update_plan"
+    # Communication actions
+    SEND_MESSAGE = "send_message"
+    # Control actions (WAIT pauses; DONE and FAIL are the two values that
+    # WebScraperEnv._check_terminated treats as ending the episode)
+    WAIT = "wait"
+    DONE = "done"
+    FAIL = "fail"
+class NavigateParams(BaseModel):
+    """Parameters for navigation actions.
+    Only ``url`` is required; the remaining fields have defaults.
+    """
+    # Destination URL (required).
+    url: str
+    # Optional wait condition; None by default.
+    # NOTE(review): accepted values are not defined in this file -- confirm.
+    wait_for: str | None = None
+    # Timeout; defaults to 30000 (presumably milliseconds, per the field name).
+    timeout_ms: int = 30000
+class ClickParams(BaseModel):
+    """Parameters for click actions.
+    Only ``selector`` is required; the remaining fields have defaults.
+    """
+    # Selector identifying the element to click (required).
+    selector: str
+    # Button to click with; defaults to "left".
+    button: str = "left"
+    # Number of clicks; defaults to 1.
+    click_count: int = 1
+    # Pause after the click; defaults to 500 (presumably milliseconds, per the name).
+    wait_after_ms: int = 500
+class FillParams(BaseModel):
+    """Parameters for form fill actions.
+    ``selector`` and ``value`` are both required.
+    """
+    # Selector identifying the input to fill (required).
+    selector: str
+    # Text to enter (required).
+    value: str
+    # Defaults to True; presumably clears existing content before filling.
+    clear_first: bool = True
+class SelectParams(BaseModel):
+    """Parameters for select dropdown actions.
+    ``selector`` is required. ``value``, ``label`` and ``index`` are all
+    optional and default to None.
+    NOTE(review): nothing in this model enforces that at least one of the
+    three is provided -- confirm how the executor resolves the option.
+    """
+    # Selector identifying the dropdown element (required).
+    selector: str
+    # Option value to choose, if selecting by value.
+    value: str | None = None
+    # Option label to choose, if selecting by label.
+    label: str | None = None
+    # Option position to choose, if selecting by index.
+    index: int | None = None
+class ScrollParams(BaseModel):
+    """Parameters for scroll actions.
+    Every field has a default, so an empty parameter set is valid.
+    """
+    # Scroll direction; defaults to "down".
+    direction: str = "down"
+    # Either an int or a string; defaults to the string "page".
+    # NOTE(review): the unit of an int amount is not defined here -- confirm.
+    amount: int | str = "page"
+    # Optional selector; None by default (presumably scrolls that element
+    # rather than the page -- confirm).
+    selector: str | None = None
+class ExtractFieldParams(BaseModel):
+    """Parameters for field extraction actions.
+    Only ``field_name`` is required; the remaining fields have defaults.
+    """
+    # Name of the field being extracted (required).
+    field_name: str
+    # Optional selector for the source element; None by default.
+    selector: str | None = None
+    # Extraction method; defaults to "text".
+    # NOTE(review): the set of supported methods is not defined in this file.
+    extraction_method: str = "text"
+    # Optional attribute name; None by default.
+    attribute: str | None = None
+    # Optional regular-expression pattern; None by default.
+    regex_pattern: str | None = None
+    # Optional post-processing step name; None by default.
+    post_process: str | None = None
+class ExtractTableParams(BaseModel):
+    """Parameters for table extraction actions.
+    Only ``table_selector`` is required; the remaining fields default to None.
+    """
+    # Selector identifying the table (required).
+    table_selector: str
+    # Optional list of header names; None by default.
+    headers: list[str] | None = None
+    # Optional selector for rows; None by default.
+    row_selector: str | None = None
+    # Optional mapping of string keys to selectors; None by default
+    # (presumably column name -> cell selector -- confirm).
+    cell_selectors: dict[str, str] | None = None
+class ExtractListParams(BaseModel):
+    """Parameters for list extraction actions.
+    All three fields are required.
+    """
+    # Selector identifying the list container (required).
+    container_selector: str
+    # Selector identifying each item (required).
+    item_selector: str
+    # Mapping of string keys to selectors (required); presumably
+    # field name -> selector evaluated per item -- confirm.
+    field_selectors: dict[str, str]
+class SearchPageParams(BaseModel):
+    """Parameters for searching within the current page.
+    Only ``query`` is required.
+    """
+    # Search query (required).
+    query: str
+    # Kind of search; defaults to "text".
+    # NOTE(review): other supported values are not defined in this file.
+    search_type: str = "text"
+class SearchEngineParams(BaseModel):
+    """Parameters for search engine queries.
+    Only ``query`` is required.
+    """
+    # Search query (required).
+    query: str
+    # Search engine identifier; defaults to "google".
+    engine: str = "google"
+    # Number of results requested; defaults to 10.
+    num_results: int = 10
+class VerifyFactParams(BaseModel):
+    """Parameters for fact verification.
+    Only ``claim`` is required.
+    """
+    # Claim to verify (required).
+    claim: str
+    # Optional list of source strings; None by default.
+    sources: list[str] | None = None
+    # Confidence threshold; defaults to 0.8.
+    # NOTE(review): no range constraint is declared on this field.
+    confidence_threshold: float = 0.8
+class VerifyFieldParams(BaseModel):
+    """Parameters for field verification.
+    Only ``field_name`` is required.
+    """
+    # Name of the field to verify (required).
+    field_name: str
+    # Optional expected type name; None by default.
+    expected_type: str | None = None
+    # Optional expected format; None by default.
+    expected_format: str | None = None
+    # Validation rules; defaults to a fresh empty list per instance.
+    validation_rules: list[str] = Field(default_factory=list)
+class MemoryParams(BaseModel):
+    """Parameters for memory operations.
+    Only ``key`` is required.
+    """
+    # Memory key (required).
+    key: str
+    # Value of any type; None by default.
+    value: Any | None = None
+    # Memory type; defaults to "working".
+    # NOTE(review): other supported types are not defined in this file.
+    memory_type: str = "working"
+    # Optional time-to-live; None by default (presumably seconds, per the name).
+    ttl_seconds: int | None = None
+class MCPToolCallParams(BaseModel):
+    """Parameters for MCP tool calls.
+    Only ``tool_name`` is required.
+    """
+    # Name of the tool to call (required).
+    tool_name: str
+    # Keyword arguments for the tool; defaults to a fresh empty dict per instance.
+    arguments: dict[str, Any] = Field(default_factory=dict)
+class PlanParams(BaseModel):
+    """Parameters for planning actions.
+    Both fields are optional and default to None.
+    """
+    # Optional free-text description of the plan.
+    plan_description: str | None = None
+    # Optional list of step dicts.
+    # NOTE(review): the schema of each step dict is not defined in this file.
+    steps: list[dict[str, Any]] | None = None
+class MessageParams(BaseModel):
+    """Parameters for inter-agent messages.
+    ``target_agent`` and ``message_type`` are required.
+    """
+    # Identifier of the receiving agent (required).
+    target_agent: str
+    # Message type (required).
+    message_type: str
+    # Message payload; defaults to a fresh empty dict per instance.
+    content: dict[str, Any] = Field(default_factory=dict)
+class WaitParams(BaseModel):
+    """Parameters for wait actions.
+    Every field has a default, so an empty parameter set is valid.
+    """
+    # Wait duration; defaults to 1000 (presumably milliseconds, per the name).
+    duration_ms: int = 1000
+    # Optional selector to wait for; None by default.
+    wait_for_selector: str | None = None
+    # Defaults to False; presumably waits for a navigation when True.
+    wait_for_navigation: bool = False
+class DoneParams(BaseModel):
+    """Parameters for completion.
+    Every field has a default, so an empty parameter set is valid.
+    """
+    # Whether the task succeeded; defaults to True.
+    success: bool = True
+    # Optional completion message; None by default.
+    message: str | None = None
+    # Optional final result payload; None by default.
+    final_result: dict[str, Any] | None = None
+class Action(BaseModel):
+    """
+    Represents an action to be taken in the environment.
+    An action consists of:
+    - action_type: The type of action
+    - parameters: Action-specific parameters
+    - reasoning: Why this action was chosen
+    - confidence: How confident the agent is
+    """
+    action_type: ActionType = Field(..., description="Type of action to execute")
+    parameters: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Action-specific parameters",
+    )
+    reasoning: str | None = Field(
+        default=None,
+        description="Agent's reasoning for this action",
+    )
+    confidence: float = Field(
+        default=1.0,
+        ge=0.0,
+        le=1.0,
+        description="Confidence in this action (0-1)",
+    )
+    agent_id: str | None = Field(
+        default=None,
+        description="ID of the agent that produced this action",
+    )
+    plan_step: int | None = Field(
+        default=None,
+        description="Which step of the plan this corresponds to",
+    )
+    @field_validator("confidence")
+    @classmethod
+    def validate_confidence(cls, v: float) -> float:
+        """Ensure confidence is between 0 and 1."""
+        return max(0.0, min(1.0, v))
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "action_type": "extract_field",
+                "parameters": {
+                    "field_name": "price",
+                    "selector": ".product-price",
+                    "extraction_method": "text",
+                },
+                "reasoning": "The price element is visible with class .product-price",
+                "confidence": 0.92,
+            }
+        }
+    )
+    @classmethod
+    def navigate(cls, url: str, **kwargs: Any) -> "Action":
+        """Create a navigate action."""
+        return cls(
+            action_type=ActionType.NAVIGATE,
+            parameters={"url": url, **kwargs},
+        )
+    @classmethod
+    def click(cls, selector: str, **kwargs: Any) -> "Action":
+        """Create a click action."""
+        return cls(
+            action_type=ActionType.CLICK,
+            parameters={"selector": selector, **kwargs},
+        )
+    @classmethod
+    def extract_field(
+        cls,
+        field_name: str,
+        selector: str | None = None,
+        **kwargs: Any,
+    ) -> "Action":
+        """Create an extract field action."""
+        return cls(
+            action_type=ActionType.EXTRACT_FIELD,
+            parameters={"field_name": field_name, "selector": selector, **kwargs},
+        )
+    @classmethod
+    def search_engine(cls, query: str, engine: str = "google", **kwargs: Any) -> "Action":
+        """Create a search engine action."""
+        return cls(
+            action_type=ActionType.SEARCH_ENGINE,
+            parameters={"query": query, "engine": engine, **kwargs},
+        )
+    @classmethod
+    def done(cls, success: bool = True, message: str | None = None) -> "Action":
+        """Create a done action."""
+        return cls(
+            action_type=ActionType.DONE,
+            parameters={"success": success, "message": message},
+        )
+    @classmethod
+    def wait(cls, duration_ms: int = 1000) -> "Action":
+        """Create a wait action."""
+        return cls(
+            action_type=ActionType.WAIT,
+            parameters={"duration_ms": duration_ms},
+        )
+    @classmethod
+    def mcp_tool_call(cls, tool_name: str, **arguments: Any) -> "Action":
+        """Create an MCP tool call action."""
+        return cls(
+            action_type=ActionType.MCP_TOOL_CALL,
+            parameters={"tool_name": tool_name, "arguments": arguments},
+        )
+    def get_param(self, key: str, default: Any = None) -> Any:
+        """Get a parameter value with optional default."""
+        return self.parameters.get(key, default)
+    def validate_params(self) -> list[str]:
+        """Validate parameters for this action type. Returns list of errors."""
+        errors = []
+        required_params = {
+            ActionType.NAVIGATE: ["url"],
+            ActionType.CLICK: ["selector"],
+            ActionType.FILL: ["selector", "value"],
+            ActionType.EXTRACT_FIELD: ["field_name"],
+            ActionType.SEARCH_ENGINE: ["query"],
+            ActionType.MCP_TOOL_CALL: ["tool_name"],
+            ActionType.SEND_MESSAGE: ["target_agent", "message_type"],
+        }
+        if self.action_type in required_params:
+            for param in required_params[self.action_type]:
+                if param not in self.parameters or self.parameters[param] is None:
+                    errors.append(f"Missing required parameter: {param}")
+        return errors

backend/app/core/env.py ADDED Viewed

	@@ -0,0 +1,497 @@

+"""Web scraper RL environment."""
+import logging
+import time
+from typing import Any
+from app.config import Settings, get_settings
+from app.core.action import Action, ActionType
+from app.core.episode import Episode, EpisodeManager
+from app.core.observation import (
+    AvailableAction,
+    ExtractedField,
+    MemoryContext,
+    Observation,
+    TaskContext,
+)
+from app.core.reward import RewardBreakdown, RewardEngine
+logger = logging.getLogger(__name__)
+class WebScraperEnv:
+    """
+    Reinforcement Learning environment for web scraping.
+    Follows the Gymnasium API pattern:
+    - reset(task_id, seed) -> observation, info
+    - step(action) -> observation, reward, terminated, truncated, info
+    - get_state() -> state dict
+    """
+    def __init__(
+        self,
+        episode_id: str,
+        settings: Settings | None = None,
+    ) -> None:
+        """
+        Initialize the environment.
+        Args:
+            episode_id: Unique identifier for this episode.
+            settings: Application settings.
+        """
+        self.episode_id = episode_id
+        self.settings = settings or get_settings()
+        self.reward_engine = RewardEngine(settings)
+        self.episode_manager = EpisodeManager()
+        # State
+        self._episode: Episode | None = None
+        self._current_observation: Observation | None = None
+        self._task_context: TaskContext | None = None
+        self._ground_truth: dict[str, Any] | None = None
+        # Browser state (placeholder - would use Playwright in production)
+        self._current_url: str | None = None
+        self._page_html: str | None = None
+        self._page_title: str | None = None
+        # Extraction state
+        self._extracted_fields: list[ExtractedField] = []
+        self._navigation_history: list[str] = []
+        # Timing
+        self._start_time: float | None = None
+    async def reset(
+        self,
+        task_id: str,
+        seed: int | None = None,
+        config: dict[str, Any] | None = None,
+    ) -> tuple[Observation, dict[str, Any]]:
+        """
+        Reset the environment for a new episode.
+        Args:
+            task_id: ID of the task to execute.
+            seed: Random seed for reproducibility.
+            config: Optional episode configuration.
+        Returns:
+            Tuple of (initial_observation, info_dict).
+        """
+        logger.info(f"Resetting environment for task {task_id}")
+        # Reset state
+        self.reward_engine.reset()
+        self._extracted_fields = []
+        self._navigation_history = []
+        self._start_time = time.time()
+        self._current_url = None
+        self._page_html = None
+        self._page_title = None
+        # Create episode
+        self._episode = self.episode_manager.create_episode(
+            episode_id=self.episode_id,
+            task_id=task_id,
+            max_steps=self.settings.max_steps_per_episode,
+            seed=seed,
+            config=config or {},
+        )
+        self._episode.start()
+        # Load task context
+        self._task_context = await self._load_task_context(task_id)
+        # Create initial observation
+        self._current_observation = self._create_observation()
+        info = {
+            "episode_id": self.episode_id,
+            "task_id": task_id,
+            "max_steps": self._episode.max_steps,
+            "target_fields": self._task_context.target_fields if self._task_context else [],
+        }
+        return self._current_observation, info
+    async def step(
+        self,
+        action: Action,
+    ) -> tuple[Observation, float, dict[str, float], bool, bool, dict[str, Any]]:
+        """
+        Execute an action and return the result.
+        Args:
+            action: The action to execute.
+        Returns:
+            Tuple of (observation, reward, reward_breakdown, terminated, truncated, info).
+        """
+        if self._episode is None or self._current_observation is None:
+            raise RuntimeError("Environment not reset. Call reset() first.")
+        if self._episode.is_terminal:
+            raise RuntimeError("Episode has already terminated.")
+        step_start = time.time()
+        prev_observation = self._current_observation
+        # Validate action
+        errors = action.validate_params()
+        if errors:
+            logger.warning(f"Invalid action parameters: {errors}")
+        # Execute action
+        action_result = await self._execute_action(action)
+        # Update observation
+        self._current_observation = self._create_observation()
+        if action_result.get("error"):
+            self._current_observation.last_action_error = action_result["error"]
+            self._current_observation.consecutive_errors = (
+                prev_observation.consecutive_errors + 1
+            )
+        else:
+            self._current_observation.consecutive_errors = 0
+        # Compute reward
+        reward, breakdown = self.reward_engine.compute_reward(
+            action=action,
+            prev_observation=prev_observation,
+            new_observation=self._current_observation,
+            ground_truth=self._ground_truth,
+            max_steps=self._episode.max_steps,
+        )
+        # Check termination
+        terminated = self._check_terminated(action)
+        truncated = self._check_truncated()
+        # Update episode
+        step_duration = (time.time() - step_start) * 1000
+        self._episode.add_step(
+            action_type=action.action_type.value,
+            action_params=action.parameters,
+            action_reasoning=action.reasoning,
+            reward=reward,
+            reward_breakdown=breakdown.to_dict(),
+            observation_summary={
+                "url": self._current_observation.current_url,
+                "progress": self._current_observation.extraction_progress,
+                "fields_extracted": len(self._current_observation.extracted_so_far),
+            },
+            error=action_result.get("error"),
+            duration_ms=step_duration,
+        )
+        # Handle terminal states
+        if terminated:
+            success = action.action_type == ActionType.DONE and action.get_param(
+                "success", True
+            )
+            self._episode.complete(
+                success=success,
+                extracted_data=self._current_observation.get_extraction_dict(),
+            )
+            # Add terminal reward
+            terminal_reward, terminal_breakdown = (
+                self.reward_engine.compute_terminal_reward(
+                    self._current_observation,
+                    success=success,
+                    ground_truth=self._ground_truth,
+                )
+            )
+            reward += terminal_reward
+            breakdown.total += terminal_reward
+        elif truncated:
+            self._episode.truncate()
+        info = {
+            "action_result": action_result,
+            "step_duration_ms": step_duration,
+            "episode_step": self._episode.current_step,
+        }
+        return (
+            self._current_observation,
+            reward,
+            breakdown.to_dict(),
+            terminated,
+            truncated,
+            info,
+        )
+    def get_state(self) -> dict[str, Any]:
+        """Get the current state of the environment."""
+        if self._episode is None:
+            return {
+                "episode_id": self.episode_id,
+                "status": "not_started",
+            }
+        return {
+            "episode_id": self.episode_id,
+            "task_id": self._episode.task_id,
+            "step_number": self._episode.current_step,
+            "current_url": self._current_url,
+            "is_terminal": self._episode.is_terminal,
+            "total_reward": self._episode.total_reward,
+            "extracted_data": (
+                self._current_observation.get_extraction_dict()
+                if self._current_observation
+                else {}
+            ),
+            "status": self._episode.status.value,
+        }
+    async def _load_task_context(self, task_id: str) -> TaskContext:
+        """Load task context from task repository."""
+        # In production, this would fetch from database
+        from app.api.routes.tasks import TASK_REPOSITORY
+        task = TASK_REPOSITORY.get(task_id)
+        if task:
+            return TaskContext(
+                task_id=task.id,
+                task_name=task.name,
+                task_type=task.task_type.value,
+                target_fields=[f.name for f in task.fields_to_extract],
+                required_fields=task.success_criteria.get("required_fields", []),
+                hints=task.hints,
+                success_criteria=task.success_criteria,
+            )
+        # Default context
+        return TaskContext(
+            task_id=task_id,
+            task_name=f"Task {task_id}",
+            task_type="unknown",
+            target_fields=[],
+            required_fields=[],
+        )
+    def _create_observation(self) -> Observation:
+        """Create an observation from current state."""
+        if self._episode is None:
+            raise RuntimeError("Episode not initialized")
+        elapsed = time.time() - (self._start_time or time.time())
+        # Get available actions
+        available_actions = self._get_available_actions()
+        # Calculate progress
+        target_fields = (
+            self._task_context.target_fields if self._task_context else []
+        )
+        extracted_names = {f.field_name for f in self._extracted_fields}
+        fields_remaining = [f for f in target_fields if f not in extracted_names]
+        progress = (
+            len(self._extracted_fields) / len(target_fields)
+            if target_fields
+            else 0.0
+        )
+        return Observation(
+            episode_id=self.episode_id,
+            task_id=self._episode.task_id,
+            step_number=self._episode.current_step,
+            elapsed_seconds=elapsed,
+            current_url=self._current_url,
+            page_title=self._page_title,
+            page_html=self._page_html,
+            navigation_history=self._navigation_history.copy(),
+            can_go_back=len(self._navigation_history) > 1,
+            task_context=self._task_context,
+            extracted_so_far=self._extracted_fields.copy(),
+            extraction_progress=progress,
+            fields_remaining=fields_remaining,
+            memory_context=MemoryContext(),
+            available_actions=available_actions,
+            tokens_used=self._episode.tokens_used,
+            api_calls_made=self._episode.api_calls,
+        )
+    def _get_available_actions(self) -> list[AvailableAction]:
+        """Get list of currently available actions."""
+        actions = []
+        # Navigation actions
+        actions.append(
+            AvailableAction(
+                action_type="navigate",
+                description="Navigate to a URL",
+                parameters={"url": "required"},
+            )
+        )
+        if self._current_url:
+            # Page interaction actions
+            actions.extend([
+                AvailableAction(
+                    action_type="click",
+                    description="Click on an element",
+                    parameters={"selector": "required"},
+                ),
+                AvailableAction(
+                    action_type="extract_field",
+                    description="Extract a field from the page",
+                    parameters={"field_name": "required", "selector": "optional"},
+                ),
+                AvailableAction(
+                    action_type="search_page",
+                    description="Search within the current page",
+                    parameters={"query": "required"},
+                ),
+            ])
+        # Always available
+        actions.extend([
+            AvailableAction(
+                action_type="search_engine",
+                description="Perform a web search",
+                parameters={"query": "required", "engine": "optional"},
+            ),
+            AvailableAction(
+                action_type="done",
+                description="Mark task as complete",
+                parameters={"success": "boolean"},
+            ),
+        ])
+        return actions
+    async def _execute_action(self, action: Action) -> dict[str, Any]:
+        """Execute an action and return the result."""
+        result: dict[str, Any] = {"success": False}
+        try:
+            match action.action_type:
+                case ActionType.NAVIGATE:
+                    result = await self._execute_navigate(action)
+                case ActionType.CLICK:
+                    result = await self._execute_click(action)
+                case ActionType.FILL:
+                    result = await self._execute_fill(action)
+                case ActionType.EXTRACT_FIELD:
+                    result = await self._execute_extract(action)
+                case ActionType.SEARCH_ENGINE:
+                    result = await self._execute_search_engine(action)
+                case ActionType.DONE:
+                    result = {"success": True, "done": True}
+                case ActionType.WAIT:
+                    await self._execute_wait(action)
+                    result = {"success": True}
+                case _:
+                    result = {
+                        "success": False,
+                        "error": f"Action type {action.action_type} not implemented",
+                    }
+        except Exception as e:
+            logger.error(f"Action execution failed: {e}")
+            result = {"success": False, "error": str(e)}
+        return result
+    async def _execute_navigate(self, action: Action) -> dict[str, Any]:
+        """Execute a navigate action."""
+        url = action.get_param("url")
+        if not url:
+            return {"success": False, "error": "URL is required"}
+        # Placeholder - in production would use Playwright
+        self._current_url = url
+        self._navigation_history.append(url)
+        self._page_title = f"Page at {url}"
+        self._page_html = f"<html><body><h1>Mock page for {url}</h1></body></html>"
+        return {"success": True, "url": url}
+    async def _execute_click(self, action: Action) -> dict[str, Any]:
+        """Execute a click action."""
+        selector = action.get_param("selector")
+        if not selector:
+            return {"success": False, "error": "Selector is required"}
+        # Placeholder
+        return {"success": True, "selector": selector, "clicked": True}
+    async def _execute_fill(self, action: Action) -> dict[str, Any]:
+        """Execute a fill action."""
+        selector = action.get_param("selector")
+        value = action.get_param("value")
+        if not selector or value is None:
+            return {"success": False, "error": "Selector and value are required"}
+        # Placeholder
+        return {"success": True, "selector": selector, "filled": True}
+    async def _execute_extract(self, action: Action) -> dict[str, Any]:
+        """Execute an extract action."""
+        field_name = action.get_param("field_name")
+        if not field_name:
+            return {"success": False, "error": "field_name is required"}
+        # Placeholder - in production would actually extract from page
+        extracted_field = ExtractedField(
+            field_name=field_name,
+            value=f"mock_value_for_{field_name}",
+            confidence=0.9,
+            source_selector=action.get_param("selector"),
+            extraction_step=self._episode.current_step if self._episode else 0,
+        )
+        self._extracted_fields.append(extracted_field)
+        return {
+            "success": True,
+            "field_name": field_name,
+            "value": extracted_field.value,
+            "confidence": extracted_field.confidence,
+        }
+    async def _execute_search_engine(self, action: Action) -> dict[str, Any]:
+        """Execute a search engine action."""
+        query = action.get_param("query")
+        if not query:
+            return {"success": False, "error": "Query is required"}
+        engine = action.get_param("engine", "google")
+        # Placeholder
+        return {
+            "success": True,
+            "query": query,
+            "engine": engine,
+            "results": [
+                {"title": f"Result 1 for {query}", "url": "https://example.com/1"},
+                {"title": f"Result 2 for {query}", "url": "https://example.com/2"},
+            ],
+        }
+    async def _execute_wait(self, action: Action) -> None:
+        """Execute a wait action."""
+        import asyncio
+        duration_ms = action.get_param("duration_ms", 1000)
+        await asyncio.sleep(duration_ms / 1000)
+    def _check_terminated(self, action: Action) -> bool:
+        """Check if the episode should terminate."""
+        if action.action_type == ActionType.DONE:
+            return True
+        if action.action_type == ActionType.FAIL:
+            return True
+        return False
+    def _check_truncated(self) -> bool:
+        """Check if the episode should be truncated."""
+        if self._episode is None:
+            return False
+        if self._episode.current_step >= self._episode.max_steps:
+            return True
+        return False

backend/app/core/episode.py ADDED Viewed

	@@ -0,0 +1,261 @@

+"""Episode state machine and management."""
+from datetime import datetime, timezone
+from enum import Enum
+from typing import Any
+from pydantic import BaseModel, Field
+class EpisodeStatus(str, Enum):
+    """Status of an episode.
+    Members subclass ``str``, so each compares equal to its string value.
+    """
+    # Default status of a newly constructed Episode.
+    PENDING = "pending"
+    # Set by Episode.start().
+    RUNNING = "running"
+    # Set by Episode.complete(), whether or not the run was successful.
+    COMPLETED = "completed"
+    # Set by Episode.fail().
+    FAILED = "failed"
+    # Set by Episode.truncate().
+    TRUNCATED = "truncated"
+    # Set by Episode.cancel().
+    CANCELLED = "cancelled"
+class EpisodeStep(BaseModel):
+    """Record of a single step in the episode.
+    Instances are created by Episode.add_step().
+    """
+    # Step counter value after the increment in add_step (the first step is 1).
+    step_number: int
+    # UTC timestamp in ISO 8601 form, taken when the step is recorded.
+    timestamp: str
+    # Action type as a plain string.
+    action_type: str
+    # Parameters of the executed action.
+    action_params: dict[str, Any]
+    # Optional reasoning supplied with the action.
+    action_reasoning: str | None = None
+    # Reward recorded for this step.
+    reward: float
+    # Named components of the reward.
+    reward_breakdown: dict[str, float]
+    # Caller-supplied summary of the resulting observation.
+    observation_summary: dict[str, Any]
+    # Error message for the step, if any.
+    error: str | None = None
+    # Step duration; defaults to 0.0 (presumably milliseconds, per the name).
+    duration_ms: float = 0.0
+class Episode(BaseModel):
+    """
+    Represents a complete episode in the RL environment.
+    An episode is a sequence of steps from reset to termination,
+    tracking all actions, rewards, and observations.
+    """
+    # Identification
+    episode_id: str
+    task_id: str
+    # Timing
+    created_at: str = Field(
+        default_factory=lambda: datetime.now(timezone.utc).isoformat()
+    )
+    started_at: str | None = None
+    ended_at: str | None = None
+    # State
+    status: EpisodeStatus = EpisodeStatus.PENDING
+    current_step: int = 0
+    max_steps: int = 50
+    # Seed for reproducibility
+    seed: int | None = None
+    # Configuration
+    config: dict[str, Any] = Field(default_factory=dict)
+    # Step history
+    steps: list[EpisodeStep] = Field(default_factory=list)
+    # Aggregates
+    total_reward: float = 0.0
+    tokens_used: int = 0
+    api_calls: int = 0
+    estimated_cost_usd: float = 0.0
+    # Results
+    extracted_data: dict[str, Any] = Field(default_factory=dict)
+    final_accuracy: float | None = None
+    success: bool | None = None
+    failure_reason: str | None = None
+    # Navigation history
+    urls_visited: list[str] = Field(default_factory=list)
+    def start(self) -> None:
+        """Mark the episode as started."""
+        self.status = EpisodeStatus.RUNNING
+        self.started_at = datetime.now(timezone.utc).isoformat()
+    def add_step(
+        self,
+        action_type: str,
+        action_params: dict[str, Any],
+        reward: float,
+        reward_breakdown: dict[str, float],
+        observation_summary: dict[str, Any],
+        action_reasoning: str | None = None,
+        error: str | None = None,
+        duration_ms: float = 0.0,
+    ) -> EpisodeStep:
+        """Add a step to the episode."""
+        self.current_step += 1
+        step = EpisodeStep(
+            step_number=self.current_step,
+            timestamp=datetime.now(timezone.utc).isoformat(),
+            action_type=action_type,
+            action_params=action_params,
+            action_reasoning=action_reasoning,
+            reward=reward,
+            reward_breakdown=reward_breakdown,
+            observation_summary=observation_summary,
+            error=error,
+            duration_ms=duration_ms,
+        )
+        self.steps.append(step)
+        self.total_reward += reward
+        return step
+    def complete(
+        self,
+        success: bool,
+        extracted_data: dict[str, Any] | None = None,
+        final_accuracy: float | None = None,
+    ) -> None:
+        """Mark the episode as completed."""
+        self.status = EpisodeStatus.COMPLETED
+        self.ended_at = datetime.now(timezone.utc).isoformat()
+        self.success = success
+        if extracted_data:
+            self.extracted_data = extracted_data
+        self.final_accuracy = final_accuracy
+    def fail(self, reason: str) -> None:
+        """Mark the episode as failed."""
+        self.status = EpisodeStatus.FAILED
+        self.ended_at = datetime.now(timezone.utc).isoformat()
+        self.success = False
+        self.failure_reason = reason
+    def truncate(self, reason: str = "max_steps_reached") -> None:
+        """Mark the episode as truncated (stopped early)."""
+        self.status = EpisodeStatus.TRUNCATED
+        self.ended_at = datetime.now(timezone.utc).isoformat()
+        self.failure_reason = reason
+    def cancel(self) -> None:
+        """Mark the episode as cancelled."""
+        self.status = EpisodeStatus.CANCELLED
+        self.ended_at = datetime.now(timezone.utc).isoformat()
+    @property
+    def is_terminal(self) -> bool:
+        """Check if the episode has terminated."""
+        return self.status in [
+            EpisodeStatus.COMPLETED,
+            EpisodeStatus.FAILED,
+            EpisodeStatus.TRUNCATED,
+            EpisodeStatus.CANCELLED,
+        ]
+    @property
+    def duration_seconds(self) -> float | None:
+        """Get episode duration in seconds."""
+        if not self.started_at:
+            return None
+        end = self.ended_at or datetime.now(timezone.utc).isoformat()
+        start_dt = datetime.fromisoformat(self.started_at.replace("Z", "+00:00"))
+        end_dt = datetime.fromisoformat(end.replace("Z", "+00:00"))
+        return (end_dt - start_dt).total_seconds()
+    @property
+    def average_reward(self) -> float:
+        """Get average reward per step."""
+        if not self.steps:
+            return 0.0
+        return self.total_reward / len(self.steps)
+    def get_summary(self) -> dict[str, Any]:
+        """Get a summary of the episode."""
+        return {
+            "episode_id": self.episode_id,
+            "task_id": self.task_id,
+            "status": self.status.value,
+            "steps": self.current_step,
+            "total_reward": self.total_reward,
+            "average_reward": self.average_reward,
+            "duration_seconds": self.duration_seconds,
+            "tokens_used": self.tokens_used,
+            "estimated_cost_usd": self.estimated_cost_usd,
+            "success": self.success,
+            "fields_extracted": len(self.extracted_data),
+        }
+    def get_step_history(
+        self,
+        start: int = 0,
+        end: int | None = None,
+    ) -> list[EpisodeStep]:
+        """Get a slice of the step history."""
+        return self.steps[start:end]
+    def get_action_sequence(self) -> list[str]:
+        """Get the sequence of action types taken."""
+        return [step.action_type for step in self.steps]
+    def get_reward_history(self) -> list[float]:
+        """Get the sequence of rewards received."""
+        return [step.reward for step in self.steps]
+class EpisodeManager:
+    """Manager for episode lifecycle."""
+    def __init__(self) -> None:
+        """Initialize the episode manager."""
+        self._episodes: dict[str, Episode] = {}
+    def create_episode(
+        self,
+        episode_id: str,
+        task_id: str,
+        max_steps: int = 50,
+        seed: int | None = None,
+        config: dict[str, Any] | None = None,
+    ) -> Episode:
+        """Create a new episode."""
+        episode = Episode(
+            episode_id=episode_id,
+            task_id=task_id,
+            max_steps=max_steps,
+            seed=seed,
+            config=config or {},
+        )
+        self._episodes[episode_id] = episode
+        return episode
+    def get_episode(self, episode_id: str) -> Episode | None:
+        """Get an episode by ID."""
+        return self._episodes.get(episode_id)
+    def remove_episode(self, episode_id: str) -> bool:
+        """Remove an episode."""
+        if episode_id in self._episodes:
+            del self._episodes[episode_id]
+            return True
+        return False
+    def list_episodes(
+        self,
+        status: EpisodeStatus | None = None,
+        task_id: str | None = None,
+    ) -> list[Episode]:
+        """List episodes with optional filtering."""
+        episodes = list(self._episodes.values())
+        if status:
+            episodes = [e for e in episodes if e.status == status]
+        if task_id:
+            episodes = [e for e in episodes if e.task_id == task_id]
+        return episodes

backend/app/core/observation.py ADDED Viewed

	@@ -0,0 +1,228 @@

+"""Observation model for the RL environment."""
+from datetime import datetime, timezone
+from typing import Any
+from pydantic import BaseModel, ConfigDict, Field
+class ToolSnapshot(BaseModel):
+    """Snapshot of a tool from the registry."""
+    # name, description and parameters have no default and must be supplied.
+    name: str
+    description: str
+    # One free-form dict per parameter; the expected keys are not constrained here.
+    parameters: list[dict[str, Any]]
+    # A tool counts as enabled unless the snapshot says otherwise.
+    enabled: bool = True
+    # Defaults to 0.0. NOTE(review): the unit is not stated in this model -- confirm.
+    cost_estimate: float = 0.0
+class MemoryContext(BaseModel):
+    """Context from memory systems."""
+    # Every field defaults to a fresh empty container, so MemoryContext() is valid.
+    # The three list fields hold free-form dict entries; their schema is not fixed here.
+    short_term: list[dict[str, Any]] = Field(default_factory=list)
+    working: list[dict[str, Any]] = Field(default_factory=list)
+    long_term_relevant: list[dict[str, Any]] = Field(default_factory=list)
+    # Unlike the list fields above, shared is a single string-keyed mapping.
+    shared: dict[str, Any] = Field(default_factory=dict)
+class PageElement(BaseModel):
+    """A significant element on the page."""
+    # selector and tag are required; everything else has a default.
+    selector: str
+    tag: str
+    # None when no text was captured for the element.
+    text: str | None = None
+    # String-to-string attribute map; empty by default.
+    attributes: dict[str, str] = Field(default_factory=dict)
+    # Elements are assumed non-interactive and visible unless stated otherwise.
+    is_interactive: bool = False
+    is_visible: bool = True
+    # Optional mapping of float values. NOTE(review): key names (e.g. x/y/width/height)
+    # are not defined in this model -- confirm against the producer.
+    bounding_box: dict[str, float] | None = None
+class ExtractedField(BaseModel):
+    """A field that has been extracted."""
+    # field_name and value are required; value may be of any type.
+    field_name: str
+    value: Any
+    # Defaults to 1.0. NOTE(review): presumably a 0-1 score, but no range is enforced here.
+    confidence: float = 1.0
+    # Selector the value was taken from, when one is recorded.
+    source_selector: str | None = None
+    # Defaults to 0; presumably the episode step at which the field was extracted -- confirm.
+    extraction_step: int = 0
+    # Fields start out unverified.
+    verified: bool = False
+class AvailableAction(BaseModel):
+    """An action that is currently available."""
+    # action_type and description are required.
+    # NOTE(review): action_type is a plain string here, not an enum -- confirm the
+    # allowed values against the action module.
+    action_type: str
+    description: str
+    # Free-form parameter mapping; empty by default.
+    parameters: dict[str, Any] = Field(default_factory=dict)
+    # None when no reward estimate is available.
+    estimated_reward: float | None = None
+    # Free-form string defaulting to "low"; the set of valid levels is not constrained here.
+    risk_level: str = "low"
+class TaskContext(BaseModel):
+    """Context about the current task."""
+    # The first five fields are required (no defaults).
+    task_id: str
+    task_name: str
+    task_type: str
+    # NOTE(review): required_fields is presumably a subset of target_fields;
+    # nothing in this model enforces that relationship -- confirm.
+    target_fields: list[str]
+    required_fields: list[str]
+    # Optional guidance strings; empty by default.
+    hints: list[str] = Field(default_factory=list)
+    # Free-form criteria mapping; empty by default.
+    success_criteria: dict[str, Any] = Field(default_factory=dict)
+class Observation(BaseModel):
+    """
+    Complete observation provided to the agent after each step.
+    Contains all information the agent needs to make decisions:
+    - Episode and task context
+    - Current page state
+    - Extracted data so far
+    - Memory context
+    - Available tools and actions
+    """
+    # Episode identification
+    episode_id: str = Field(..., description="Unique episode identifier")
+    task_id: str = Field(..., description="Task being executed")
+    step_number: int = Field(..., description="Current step in the episode")
+    # Timing
+    timestamp: str = Field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+    elapsed_seconds: float = Field(default=0.0, description="Time elapsed in episode")
+    # Page state
+    current_url: str | None = Field(default=None, description="Current page URL")
+    page_title: str | None = Field(default=None, description="Current page title")
+    page_html: str | None = Field(default=None, description="Full HTML of current page")
+    page_html_chunked: list[str] = Field(
+        default_factory=list,
+        description="HTML split into semantic chunks",
+    )
+    page_text: str | None = Field(default=None, description="Visible text content")
+    page_elements: list[PageElement] = Field(
+        default_factory=list,
+        description="Significant page elements",
+    )
+    # Navigation state
+    navigation_history: list[str] = Field(
+        default_factory=list,
+        description="URLs visited in this episode",
+    )
+    can_go_back: bool = Field(default=False)
+    can_go_forward: bool = Field(default=False)
+    # Task context
+    task_context: TaskContext | None = Field(
+        default=None,
+        description="Information about the current task",
+    )
+    # Extraction state
+    extracted_so_far: list[ExtractedField] = Field(
+        default_factory=list,
+        description="Fields extracted so far",
+    )
+    extraction_progress: float = Field(
+        default=0.0,
+        description="Progress towards task completion (0-1)",
+    )
+    fields_remaining: list[str] = Field(
+        default_factory=list,
+        description="Fields still to be extracted",
+    )
+    # Memory context
+    memory_context: MemoryContext = Field(
+        default_factory=MemoryContext,
+        description="Relevant memories from all layers",
+    )
+    # Tool registry snapshot
+    tool_registry_snapshot: list[ToolSnapshot] = Field(
+        default_factory=list,
+        description="Available tools and their state",
+    )
+    # Available actions
+    available_actions: list[AvailableAction] = Field(
+        default_factory=list,
+        description="Actions available in current state",
+    )
+    # Agent coordination
+    pending_messages: list[dict[str, Any]] = Field(
+        default_factory=list,
+        description="Messages from other agents",
+    )
+    active_plan: dict[str, Any] | None = Field(
+        default=None,
+        description="Current execution plan if any",
+    )
+    current_plan_step: int | None = Field(
+        default=None,
+        description="Current step in the plan",
+    )
+    # Error state
+    last_action_error: str | None = Field(
+        default=None,
+        description="Error from last action if any",
+    )
+    consecutive_errors: int = Field(
+        default=0,
+        description="Number of consecutive action errors",
+    )
+    # Cost tracking
+    tokens_used: int = Field(default=0, description="LLM tokens used so far")
+    api_calls_made: int = Field(default=0, description="API calls made")
+    estimated_cost_usd: float = Field(default=0.0, description="Estimated cost so far")
+    # Hints and guidance
+    system_hints: list[str] = Field(
+        default_factory=list,
+        description="Hints from the environment or previous steps",
+    )
+    model_config = ConfigDict(
+        json_schema_extra={
+            "example": {
+                "episode_id": "ep_abc123",
+                "task_id": "task_001",
+                "step_number": 5,
+                "current_url": "https://example.com/product/123",
+                "page_title": "Product Details - Example Store",
+                "extracted_so_far": [
+                    {
+                        "field_name": "product_name",
+                        "value": "Example Product",
+                        "confidence": 0.95,
+                    }
+                ],
+                "extraction_progress": 0.33,
+                "fields_remaining": ["price", "description"],
+            }
+        }
+    )
+    def get_extraction_dict(self) -> dict[str, Any]:
+        """Get extracted fields as a dictionary."""
+        return {field.field_name: field.value for field in self.extracted_so_far}
+    def is_field_extracted(self, field_name: str) -> bool:
+        """Check if a field has been extracted."""
+        return any(f.field_name == field_name for f in self.extracted_so_far)
+    def get_context_summary(self) -> str:
+        """Get a summary of the current context for LLM prompts."""
+        parts = [
+            f"Step {self.step_number}",
+            f"URL: {self.current_url or 'None'}",
+            f"Progress: {self.extraction_progress:.0%}",
+            f"Extracted: {len(self.extracted_so_far)}/{len(self.extracted_so_far) + len(self.fields_remaining)} fields",
+        ]
+        if self.last_action_error:
+            parts.append(f"Last error: {self.last_action_error}")
+        return " | ".join(parts)

backend/app/core/reward.py ADDED Viewed

	@@ -0,0 +1,398 @@

+"""Reward computation engine with component breakdown."""
+from dataclasses import dataclass, field
+from typing import Any
+from app.config import Settings, get_settings
+from app.core.action import Action, ActionType
+from app.core.observation import Observation
+@dataclass
+class RewardBreakdown:
+    """Detailed breakdown of reward components."""
+    # Core components
+    accuracy: float = 0.0
+    efficiency: float = 0.0
+    cost: float = 0.0
+    completeness: float = 0.0
+    # Bonus/penalty components
+    progress_bonus: float = 0.0
+    error_penalty: float = 0.0
+    time_penalty: float = 0.0
+    redundancy_penalty: float = 0.0
+    exploration_bonus: float = 0.0
+    verification_bonus: float = 0.0
+    # Metadata
+    total: float = 0.0
+    components: dict[str, float] = field(default_factory=dict)
+    def compute_total(self, weights: dict[str, float]) -> float:
+        """Compute total reward with weights."""
+        self.total = (
+            self.accuracy * weights.get("accuracy", 0.4)
+            + self.efficiency * weights.get("efficiency", 0.2)
+            + self.cost * weights.get("cost", 0.2)
+            + self.completeness * weights.get("completeness", 0.2)
+            + self.progress_bonus
+            + self.exploration_bonus
+            + self.verification_bonus
+            - self.error_penalty
+            - self.time_penalty
+            - self.redundancy_penalty
+        )
+        self.components = {
+            "accuracy": self.accuracy,
+            "efficiency": self.efficiency,
+            "cost": self.cost,
+            "completeness": self.completeness,
+            "progress_bonus": self.progress_bonus,
+            "error_penalty": self.error_penalty,
+            "time_penalty": self.time_penalty,
+            "redundancy_penalty": self.redundancy_penalty,
+            "exploration_bonus": self.exploration_bonus,
+            "verification_bonus": self.verification_bonus,
+        }
+        return self.total
+    def to_dict(self) -> dict[str, float]:
+        """Convert to dictionary."""
+        return {
+            "total": self.total,
+            **self.components,
+        }
+class RewardEngine:
+    """
+    Computes rewards for actions in the web scraping environment.
+    Reward components:
+    - Accuracy: How correct extracted data is
+    - Efficiency: Steps taken vs optimal
+    - Cost: API/compute costs
+    - Completeness: Progress towards task completion
+    Plus bonuses/penalties for:
+    - Progress: Making progress towards goal
+    - Errors: Failed actions or invalid extractions
+    - Time: Taking too long
+    - Redundancy: Repeating unsuccessful actions
+    - Exploration: Discovering new information
+    - Verification: Validating extracted data
+    """
+    def __init__(self, settings: Settings | None = None) -> None:
+        """Initialize the reward engine."""
+        self.settings = settings or get_settings()
+        self.weights = {
+            "accuracy": self.settings.reward_accuracy_weight,
+            "efficiency": self.settings.reward_efficiency_weight,
+            "cost": self.settings.reward_cost_weight,
+            "completeness": self.settings.reward_completeness_weight,
+        }
+        # Tracking for penalties
+        self._action_history: list[Action] = []
+        self._extraction_attempts: dict[str, int] = {}
+        self._url_visits: dict[str, int] = {}
+    def reset(self) -> None:
+        """Reset tracking state for a new episode."""
+        self._action_history.clear()
+        self._extraction_attempts.clear()
+        self._url_visits.clear()
+    def compute_reward(
+        self,
+        action: Action,
+        prev_observation: Observation,
+        new_observation: Observation,
+        ground_truth: dict[str, Any] | None = None,
+        max_steps: int = 50,
+    ) -> tuple[float, RewardBreakdown]:
+        """
+        Compute reward for an action.
+        The action is appended to the engine's action history, each component,
+        bonus and penalty is evaluated by its helper, and the total is produced
+        by RewardBreakdown.compute_total using self.weights. This method mutates
+        the engine's tracking state, so it is not idempotent.
+        Args:
+            action: The action that was taken.
+            prev_observation: Observation before the action.
+            new_observation: Observation after the action.
+            ground_truth: Optional ground truth data for accuracy calculation.
+            max_steps: Maximum steps allowed in episode.
+        Returns:
+            Tuple of (total_reward, breakdown).
+        """
+        breakdown = RewardBreakdown()
+        # Track action (the history therefore already contains the current
+        # action when the penalty helpers run)
+        self._action_history.append(action)
+        # Compute accuracy component
+        breakdown.accuracy = self._compute_accuracy(
+            action, new_observation, ground_truth
+        )
+        # Compute efficiency component
+        breakdown.efficiency = self._compute_efficiency(
+            new_observation.step_number, max_steps
+        )
+        # Compute cost component
+        breakdown.cost = self._compute_cost_reward(new_observation)
+        # Compute completeness component
+        breakdown.completeness = self._compute_completeness(
+            prev_observation, new_observation
+        )
+        # Compute bonuses
+        breakdown.progress_bonus = self._compute_progress_bonus(
+            prev_observation, new_observation
+        )
+        # NOTE: the exploration bonus updates self._url_visits, which the
+        # redundancy penalty reads further down, so this call must stay
+        # ahead of the penalty computation.
+        breakdown.exploration_bonus = self._compute_exploration_bonus(
+            action, new_observation
+        )
+        breakdown.verification_bonus = self._compute_verification_bonus(
+            action, new_observation
+        )
+        # Compute penalties
+        breakdown.error_penalty = self._compute_error_penalty(new_observation)
+        breakdown.time_penalty = self._compute_time_penalty(new_observation, max_steps)
+        breakdown.redundancy_penalty = self._compute_redundancy_penalty(action)
+        # Compute total (also fills breakdown.total and breakdown.components)
+        total = breakdown.compute_total(self.weights)
+        return total, breakdown
+    def _compute_accuracy(
+        self,
+        action: Action,
+        observation: Observation,
+        ground_truth: dict[str, Any] | None,
+    ) -> float:
+        """Compute accuracy reward component."""
+        if ground_truth is None:
+            # Without ground truth, use confidence scores
+            if observation.extracted_so_far:
+                avg_confidence = sum(
+                    f.confidence for f in observation.extracted_so_far
+                ) / len(observation.extracted_so_far)
+                return avg_confidence
+            return 0.5  # Neutral
+        # With ground truth, compute actual accuracy
+        extracted = observation.get_extraction_dict()
+        if not extracted:
+            return 0.0
+        correct = 0
+        total = 0
+        for field_name, expected_value in ground_truth.items():
+            if field_name in extracted:
+                total += 1
+                actual_value = extracted[field_name]
+                if self._values_match(actual_value, expected_value):
+                    correct += 1
+        if total == 0:
+            return 0.0
+        return correct / total
+    def _values_match(self, actual: Any, expected: Any) -> bool:
+        """Check if extracted value matches expected value."""
+        if actual == expected:
+            return True
+        # Fuzzy matching for strings
+        if isinstance(actual, str) and isinstance(expected, str):
+            actual_clean = actual.strip().lower()
+            expected_clean = expected.strip().lower()
+            if actual_clean == expected_clean:
+                return True
+            # Partial match
+            if expected_clean in actual_clean or actual_clean in expected_clean:
+                return True
+        # Numeric comparison with tolerance
+        if isinstance(actual, (int, float)) and isinstance(expected, (int, float)):
+            tolerance = abs(expected) * 0.01 if expected != 0 else 0.01
+            return abs(actual - expected) <= tolerance
+        return False
+    def _compute_efficiency(self, current_step: int, max_steps: int) -> float:
+        """Compute efficiency based on steps taken."""
+        # Higher reward for completing tasks in fewer steps
+        remaining_ratio = (max_steps - current_step) / max_steps
+        return max(0.0, remaining_ratio)
+    def _compute_cost_reward(self, observation: Observation) -> float:
+        """Compute reward based on cost efficiency."""
+        # Penalize high token usage and API calls
+        max_expected_tokens = 10000
+        max_expected_calls = 50
+        token_efficiency = 1.0 - min(
+            observation.tokens_used / max_expected_tokens, 1.0
+        )
+        call_efficiency = 1.0 - min(
+            observation.api_calls_made / max_expected_calls, 1.0
+        )
+        return (token_efficiency + call_efficiency) / 2
+    def _compute_completeness(
+        self,
+        prev_observation: Observation,
+        new_observation: Observation,
+    ) -> float:
+        """Compute completeness based on extraction progress."""
+        return new_observation.extraction_progress
+    def _compute_progress_bonus(
+        self,
+        prev_observation: Observation,
+        new_observation: Observation,
+    ) -> float:
+        """Bonus for making progress."""
+        progress_delta = (
+            new_observation.extraction_progress - prev_observation.extraction_progress
+        )
+        # Bonus for new extractions
+        new_extractions = len(new_observation.extracted_so_far) - len(
+            prev_observation.extracted_so_far
+        )
+        bonus = 0.0
+        if progress_delta > 0:
+            bonus += progress_delta * 0.5
+        if new_extractions > 0:
+            bonus += new_extractions * 0.1
+        return bonus
+    def _compute_exploration_bonus(
+        self,
+        action: Action,
+        observation: Observation,
+    ) -> float:
+        """Bonus for exploring new pages."""
+        bonus = 0.0
+        if action.action_type == ActionType.NAVIGATE:
+            url = action.get_param("url", "")
+            if url and url not in self._url_visits:
+                bonus += 0.05
+            self._url_visits[url] = self._url_visits.get(url, 0) + 1
+        return bonus
+    def _compute_verification_bonus(
+        self,
+        action: Action,
+        observation: Observation,
+    ) -> float:
+        """Bonus for verification actions."""
+        if action.action_type in [ActionType.VERIFY_FACT, ActionType.VERIFY_FIELD]:
+            return 0.05
+        return 0.0
+    def _compute_error_penalty(self, observation: Observation) -> float:
+        """Penalty for errors."""
+        if observation.last_action_error:
+            base_penalty = 0.1
+            consecutive_penalty = observation.consecutive_errors * 0.05
+            return base_penalty + consecutive_penalty
+        return 0.0
+    def _compute_time_penalty(
+        self,
+        observation: Observation,
+        max_steps: int,
+    ) -> float:
+        """Penalty for taking too long."""
+        step_ratio = observation.step_number / max_steps
+        if step_ratio > 0.8:
+            return (step_ratio - 0.8) * 0.5
+        return 0.0
+    def _compute_redundancy_penalty(self, action: Action) -> float:
+        """Penalty for redundant actions."""
+        if len(self._action_history) < 2:
+            return 0.0
+        # Check for repeated extract attempts on same field
+        if action.action_type == ActionType.EXTRACT_FIELD:
+            field = action.get_param("field_name", "")
+            attempts = self._extraction_attempts.get(field, 0)
+            self._extraction_attempts[field] = attempts + 1
+            if attempts > 0:
+                return min(attempts * 0.05, 0.2)
+        # Check for repeated navigation to same URL
+        if action.action_type == ActionType.NAVIGATE:
+            url = action.get_param("url", "")
+            visits = self._url_visits.get(url, 0)
+            if visits > 1:
+                return min((visits - 1) * 0.03, 0.15)
+        return 0.0
+    def compute_terminal_reward(
+        self,
+        observation: Observation,
+        success: bool,
+        ground_truth: dict[str, Any] | None = None,
+    ) -> tuple[float, RewardBreakdown]:
+        """
+        Compute final reward at episode termination.
+        Args:
+            observation: Final observation.
+            success: Whether the task was completed successfully.
+            ground_truth: Optional ground truth for accuracy.
+        Returns:
+            Tuple of (total_reward, breakdown).
+        """
+        breakdown = RewardBreakdown()
+        if success:
+            # Big bonus for successful completion
+            breakdown.completeness = 1.0
+            breakdown.progress_bonus = 0.5
+            # Compute final accuracy
+            if ground_truth:
+                extracted = observation.get_extraction_dict()
+                correct = sum(
+                    1 for k, v in ground_truth.items()
+                    if k in extracted and self._values_match(extracted[k], v)
+                )
+                total = len(ground_truth)
+                breakdown.accuracy = correct / total if total > 0 else 1.0
+            else:
+                breakdown.accuracy = observation.extraction_progress
+            # Efficiency bonus for fast completion
+            breakdown.efficiency = 1.0 - (
+                observation.step_number / self.settings.max_steps_per_episode
+            )
+        else:
+            # Partial credit for progress made
+            breakdown.completeness = observation.extraction_progress * 0.5
+            breakdown.error_penalty = 0.3
+        total = breakdown.compute_total(self.weights)
+        return total, breakdown