# Arijit-07
# Add Task 7: Multi-Region Failover (partial failover with compliance constraints)
# d59268c
from __future__ import annotations
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any, Literal
from enum import Enum
class ActionType(str, Enum):
    """Closed set of action identifiers used for ``Action.action_type``.

    The ``str`` mix-in makes each member compare equal to its string value
    (for example ``ActionType.NOOP == "noop"``), and a member can be looked
    up from that value with ``ActionType("noop")``.
    """

    DIAGNOSE = "diagnose"
    READ_LOGS = "read_logs"
    READ_METRICS = "read_metrics"
    READ_RUNBOOK = "read_runbook"
    RESTART_SERVICE = "restart_service"
    ROLLBACK = "rollback"
    SCALE_UP = "scale_up"
    ALERT_ONCALL = "alert_oncall"
    ACKNOWLEDGE = "acknowledge"
    NOOP = "noop"
    SEARCH_LOGS = "search_logs"
    BLOCK_IP_RANGE = "block_ip_range"
    CREATE_INDEX = "create_index"
    FAILOVER = "failover"
class Action(BaseModel):
    """One action request: the action kind plus its optional arguments.

    Only ``action_type`` is required; every other field defaults to ``None``.
    This model does not declare which arguments each action type consumes,
    apart from the note on ``query``; the pairings marked "presumably" below
    are guesses from the field names and should be confirmed against the
    code that executes actions.
    """

    action_type: ActionType
    service: Optional[str] = None
    root_cause: Optional[str] = None
    runbook: Optional[str] = None
    version: Optional[str] = None
    reason: Optional[str] = None
    query: Optional[str] = None  # used with search_logs
    ip_range: Optional[str] = None  # presumably for block_ip_range -- TODO confirm
    table: Optional[str] = None  # presumably for create_index -- TODO confirm
    column: Optional[str] = None  # presumably for create_index -- TODO confirm
    target_region: Optional[str] = None  # presumably for failover -- TODO confirm
class Alert(BaseModel):
    """An alert raised against a service.

    ``severity`` is restricted to ``"critical"``, ``"warning"`` or ``"info"``.
    ``acknowledged`` starts as ``False`` unless a value is supplied.
    """

    id: str
    severity: Literal["critical", "warning", "info"]
    service: str
    message: str
    timestamp: str  # kept as a string; the expected format is not defined here
    acknowledged: bool = False
class ServiceStatus(BaseModel):
    """Status snapshot for a single service.

    Holds the health state (one of ``"healthy"``, ``"degraded"``, ``"down"``,
    ``"unknown"``), resource and latency metrics, running versus desired
    replica counts, deployment information, and SLA tracking fields.
    """

    name: str
    status: Literal["healthy", "degraded", "down", "unknown"]
    cpu_percent: float
    memory_percent: float
    error_rate: float  # NOTE(review): fraction vs. percent is not stated here -- confirm
    latency_p99_ms: float
    replicas_running: int
    replicas_desired: int
    current_version: str
    last_deployed: str
    # SLA tracking — updated each step if unresolved
    sla_breach: bool = False
    minutes_degraded: int = 0
class ServiceDependency(BaseModel):
    """Describes which services call which — critical for cascade diagnosis."""

    service: str
    calls: List[str]  # services this one depends on
    called_by: List[str]  # services that depend on this one
class EvidenceEntry(BaseModel):
    """One piece of gathered evidence — accumulated across steps."""

    step: int
    source: str  # e.g. "logs:payment-service" or "metrics:inventory-service"
    summary: str  # short digest of what was found
    raw: str  # full content returned by read action
class Observation(BaseModel):
    """Everything reported for one step of a task.

    Combines step counters, the task identity and description, per-service
    status, active alerts, recent logs, the available runbooks, the service
    dependency topology, accumulated evidence, SLA status, the outcome of
    the last action, and incident timing.
    """

    step: int
    max_steps: int
    task_id: str
    task_description: str
    services: List[ServiceStatus]
    active_alerts: List[Alert]
    recent_logs: Dict[str, List[str]]  # presumably keyed by service name -- TODO confirm
    available_runbooks: List[str]
    # The [] / {} defaults below are not shared between instances: pydantic
    # copies mutable field defaults when it builds each model.
    # NEW: dependency topology so agent can reason about cascades
    service_dependencies: List[ServiceDependency] = []
    # NEW: accumulated evidence from all previous read actions
    evidence_log: List[EvidenceEntry] = []
    # NEW: SLA status — shows urgency
    sla_status: Dict[str, str] = {}  # service -> "ok" | "warning" | "breached"
    last_action_result: Optional[str] = None
    last_action_error: Optional[str] = None
    incident_start_time: str
    elapsed_minutes: int
class StepResult(BaseModel):
    """Outcome of a single step.

    Carries the resulting observation, a float reward, a ``done`` flag, and
    a free-form ``info`` dictionary that defaults to empty.
    """

    observation: Observation
    reward: float
    done: bool
    # Safe as a literal default: pydantic copies mutable defaults per instance.
    info: Dict[str, Any] = {}
class State(BaseModel):
    """Full state of one episode.

    Tracks the episode and task identifiers, the current step and
    observation, the history of actions taken, the accumulated reward,
    whether the incident is resolved, and the ground-truth root cause and
    fix for the task.
    """

    episode_id: str
    task_id: str
    step: int
    current_observation: Observation
    action_history: List[Dict[str, Any]]  # one dict per action; schema not defined here
    total_reward: float
    incident_resolved: bool
    # NOTE(review): the ground-truth fields look like grading data that should
    # stay hidden from the agent -- confirm how State is exposed.
    ground_truth_root_cause: str
    ground_truth_fix: str
    # Safe as a literal default: pydantic copies mutable defaults per instance.
    info: Dict[str, Any] = {}