"""
models.py
====================================

This file defines the core data structures ("contracts") used in the
PyDebug-Optimizer environment.

We use Pydantic (v2) for:
✅ Data validation (ensures agent outputs are correct format)
✅ Type safety (prevents runtime bugs)
✅ Serialization (easy JSON conversion for OpenEnv)

🧠 MDP CONNECTION:
------------------
In Reinforcement Learning (RL), an environment is modeled as a Markov Decision Process (MDP):

    (S, A, R, T)

Where:
- S = State (Observation)
- A = Action (Agent decision)
- R = Reward (Feedback signal)
- T = Transition (handled in env.py)

This file defines:
- Observation → State (S)
- Action → Action (A)
- Reward → Reward (R)

These models enforce STRUCTURE on how the agent interacts with the environment.
"""

from typing import Dict, Literal
from pydantic import BaseModel, Field


# ============================================================
# 🧩 OBSERVATION MODEL (STATE)
# ============================================================

class Observation(BaseModel):
    """
    Observation = STATE (S) of the Markov Decision Process.

    One instance is the complete view handed to the agent at a single step.

    Why a Pydantic model?
    ---------------------
    - All three fields are required, so an observation with a missing
      field is rejected at construction time.
    - Each field is annotated ``str``, so the values are type-validated.

    Fields:
    -------
    code_snippet:
        The buggy Python code the agent is asked to analyze and fix.

    error_feedback:
        Runtime errors, stack traces, or hints produced by a previous
        execution; gives the agent evidence about what went wrong.

    task_description:
        Natural-language statement of the task, for example:
        "Fix the off-by-one error in this loop"
    """

    code_snippet: str = Field(
        ...,
        description="Buggy Python code",
    )
    error_feedback: str = Field(
        ...,
        description="Execution error or logs",
    )
    task_description: str = Field(
        ...,
        description="Description of the task",
    )


# ============================================================
# ⚙️ ACTION MODEL (AGENT DECISION)
# ============================================================

class Action(BaseModel):
    """
    Action = AGENT DECISION (A) of the MDP.

    This is the central model of the project: rather than returning only
    a patched snippet, the agent has to submit a structured answer that
    walks through a full debugging pipeline:

    1. Diagnose the problem
    2. Explain the reasoning
    3. Fix the code
    4. Optimize performance

    Design intent:
    --------------
    - Encourage explicit, step-by-step reasoning
    - Keep evaluation interpretable
    - Discourage shallow guessing
    - Give RL training a richer signal

    Every field is required.

    Fields:
    -------
    error_type:
        Category of the bug. Constrained by ``Literal`` to exactly one of
        "syntax", "runtime" or "logical"; any other value fails validation.

    error_justification:
        Why that category was chosen, for example:
        "Missing colon after function definition causes SyntaxError"

    fixed_code:
        The corrected version of the buggy code.

    fix_justification:
        How the fix resolves the issue.

    optimized_code:
        A further-improved version aimed at time complexity, for example:
        O(n^2) → O(n) using hash maps

    complexity_justification:
        Explanation of the complexity improvement in Big-O terms.
    """

    # --- Step 1: diagnosis -------------------------------------------------
    error_type: Literal["syntax", "runtime", "logical"] = Field(
        ...,
        description="Type of error identified",
    )
    error_justification: str = Field(
        ...,
        description="Why this error type was chosen",
    )

    # --- Step 2: repair ----------------------------------------------------
    fixed_code: str = Field(
        ...,
        description="Corrected version of the code",
    )
    fix_justification: str = Field(
        ...,
        description="Explanation of the fix",
    )

    # --- Step 3: optimization ----------------------------------------------
    optimized_code: str = Field(
        ...,
        description="Optimized version of the code",
    )
    complexity_justification: str = Field(
        ...,
        description="Explanation of time complexity improvement",
    )


# ============================================================
# 🎯 REWARD MODEL (FEEDBACK SIGNAL)
# ============================================================

class Reward(BaseModel):
    """
    Reward = FEEDBACK (R) in the MDP.

    Tells the agent how well it performed on a step.

    Why a structured reward?
    ------------------------
    Alongside the scalar, a per-component breakdown is carried so that:
    - agent behavior is easier to debug
    - evaluation can be reported in detail
    - reward shaping can guide learning more effectively

    Fields:
    -------
    value:
        Required final scalar reward. Must be a finite number in the
        closed range [0.0, 1.0]; NaN and infinities are rejected.

    component_scores:
        Optional breakdown of the reward into named parts; defaults to a
        fresh empty dict per instance. Illustrative example:
        {
            "identification": 0.2,
            "repair": 0.2,
            "correctness": 0.2,
            "optimization": 0.3
        }
        This model does not validate the keys, the range of the individual
        scores, or that they add up to ``value``.

    MDP Insight:
    ------------
    The agent's goal is to maximize expected cumulative reward:

        max E[ Σ R_t ]
    """

    # allow_inf_nan=False: every ordering comparison against NaN is False,
    # so the ge/le bounds on their own are not a reliable guard against it.
    # A NaN reward is meaningless and does not round-trip through JSON.
    value: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        allow_inf_nan=False,
        description="Total reward (0.0 to 1.0)",
    )

    # default_factory gives each Reward its own dict (no shared mutable default).
    component_scores: Dict[str, float] = Field(
        default_factory=dict,
        description="Breakdown of reward components",
    )