# NOTE(review): recovered from a web paste; the original capture included a
# "Spaces: Runtime error" banner here (likely caused by the missing
# @dataclass decorators fixed below).
"""
Pythonic data models for ARF Demo - COMPLETE VERSION
"""
from dataclasses import dataclass, asdict
from enum import Enum
from typing import Dict, List, Optional, Any
import datetime
# Import from the actual ARF OSS package; fall back to lightweight mocks so
# the demo still runs when the package is not installed.
try:
    from agentic_reliability_framework.arf_core.models.healing_intent import (
        HealingIntent,
        create_scale_out_intent,
        create_rollback_intent,
        create_restart_intent
    )
    from agentic_reliability_framework.arf_core.engine.simple_mcp_client import OSSMCPClient
    ARF_OSS_AVAILABLE = True
except ImportError:
    ARF_OSS_AVAILABLE = False

    # Fallback mock classes for demo
    class HealingIntent:
        """Minimal stand-in for the real HealingIntent model."""

        def __init__(self, **kwargs):
            self.intent_type = kwargs.get("intent_type", "scale_out")
            self.parameters = kwargs.get("parameters", {})

        def to_dict(self):
            """Serialize the intent with a creation timestamp."""
            return {
                "intent_type": self.intent_type,
                "parameters": self.parameters,
                "created_at": datetime.datetime.now().isoformat()
            }

    def create_scale_out_intent(resource_type: str, scale_factor: float = 2.0):
        """Mock factory mirroring the real create_scale_out_intent."""
        return HealingIntent(
            intent_type="scale_out",
            parameters={
                "resource_type": resource_type,
                "scale_factor": scale_factor,
                "action": "Increase capacity"
            }
        )

    # BUG FIX: the try-branch imports these two factories as well, so the
    # fallback must define them too — otherwise any caller that uses them
    # without the ARF package installed hits a NameError.
    def create_rollback_intent(resource_type: str, target_version: str = "previous"):
        """Mock factory for a rollback healing intent."""
        return HealingIntent(
            intent_type="rollback",
            parameters={
                "resource_type": resource_type,
                "target_version": target_version,
                "action": "Roll back to a known-good version"
            }
        )

    def create_restart_intent(resource_type: str):
        """Mock factory for a restart healing intent."""
        return HealingIntent(
            intent_type="restart",
            parameters={
                "resource_type": resource_type,
                "action": "Restart the component"
            }
        )

    class OSSMCPClient:
        """Advisory-only mock of the OSS MCP client."""

        def __init__(self):
            self.mode = "advisory"

        def analyze_incident(self, metrics: Dict, pattern: str = "") -> Dict:
            """Return a canned advisory analysis echoing the given pattern."""
            return {
                "status": "analysis_complete",
                "recommendations": [
                    "Increase resource allocation",
                    "Implement monitoring",
                    "Add circuit breakers",
                    "Optimize configuration"
                ],
                "confidence": 0.92,
                "pattern_matched": pattern,
                "healing_intent": {
                    "type": "scale_out",
                    "requires_execution": True
                }
            }
class IncidentSeverity(Enum):
    """Severity levels an incident scenario can carry."""

    LOW = "LOW"
    MEDIUM = "MEDIUM"
    HIGH = "HIGH"
    CRITICAL = "CRITICAL"
class DemoMode(Enum):
    """Supported presentation modes for the demo."""

    QUICK = "quick"
    COMPREHENSIVE = "comprehensive"
    INVESTOR = "investor"
@dataclass
class OSSAnalysis:
    """Structured OSS analysis results - using actual ARF.

    BUG FIX: the class body is written as dataclass field annotations and
    to_dict() calls asdict(self), but the @dataclass decorator was missing —
    without it, keyword construction and asdict() both fail at runtime.
    """

    status: str
    recommendations: List[str]
    estimated_time: str
    engineers_needed: str
    manual_effort: str
    confidence_score: float = 0.95
    healing_intent: Optional[Dict] = None

    def to_dict(self) -> Dict:
        """Convert to dictionary, including healing intent if available."""
        data = asdict(self)
        if self.healing_intent:
            # Normalize the intent payload into the summary shape the UI expects.
            data["healing_intent"] = {
                "type": "HealingIntent",
                "recommendations": self.recommendations,
                "requires_execution": True
            }
        return data

    @classmethod
    def from_arf_analysis(cls, arf_result: Dict, scenario_name: str) -> 'OSSAnalysis':
        """Create from actual ARF analysis result.

        BUG FIX: this alternate constructor takes `cls` but was not decorated
        with @classmethod, so OSSAnalysis.from_arf_analysis(result, name)
        mis-bound its arguments.
        """
        recommendations = arf_result.get("recommendations", [
            "Increase resource allocation",
            "Implement monitoring",
            "Add circuit breakers",
            "Optimize configuration"
        ])
        return cls(
            status="✅ ARF OSS Analysis Complete",
            recommendations=recommendations,
            estimated_time="45-90 minutes",
            engineers_needed="2-3 engineers",
            manual_effort="High",
            confidence_score=0.92,
            healing_intent={
                "scenario": scenario_name,
                "actions": recommendations,
                "execution_required": True,
                "auto_execution": False  # OSS is advisory only
            }
        )
@dataclass
class EnterpriseResults:
    """Structured enterprise execution results.

    BUG FIX: missing @dataclass decorator — to_dict() calls asdict(self),
    which raises TypeError on a non-dataclass instance.
    """

    actions_completed: List[str]
    metrics_improvement: Dict[str, str]
    business_impact: Dict[str, Any]
    approval_required: bool = True
    execution_time: str = ""
    healing_intent_executed: bool = True

    def to_dict(self) -> Dict:
        """Serialize, stamping the enterprise execution metadata block."""
        data = asdict(self)
        data["arf_enterprise"] = {
            "execution_complete": True,
            "learning_applied": True,
            "audit_trail_created": True
        }
        return data
@dataclass
class IncidentScenario:
    """Pythonic incident scenario model with ARF integration.

    BUG FIX: missing @dataclass decorator — without it the keyword
    construction used by IncidentDatabase.get_scenarios() raises TypeError.
    """

    name: str
    severity: IncidentSeverity
    metrics: Dict[str, str]
    impact: Dict[str, str]
    arf_pattern: str = ""  # ARF pattern name for RAG recall
    oss_analysis: Optional[OSSAnalysis] = None
    enterprise_results: Optional[EnterpriseResults] = None

    def to_dict(self) -> Dict:
        """Convert to dictionary for JSON serialization."""
        data = {
            "name": self.name,
            "severity": self.severity.value,
            "metrics": self.metrics,
            "impact": self.impact,
            "arf_oss_available": ARF_OSS_AVAILABLE
        }
        # Nested results are optional and only serialized when present.
        if self.oss_analysis:
            data["oss_analysis"] = self.oss_analysis.to_dict()
        if self.enterprise_results:
            data["enterprise_results"] = self.enterprise_results.to_dict()
        return data
@dataclass
class DemoStep:
    """Demo step for presenter guidance.

    BUG FIX: missing @dataclass decorator — the annotated fields were inert
    without it, so DemoStep(title=..., ...) raised TypeError.
    """

    title: str
    scenario: Optional[str]
    action: str
    message: str
    icon: str = "🎯"
    arf_integration: bool = False
# ===========================================
# INCIDENT DATABASE
# ===========================================
class IncidentDatabase:
    """Database of incident scenarios for the demo."""

    @staticmethod
    def get_scenarios() -> Dict[str, IncidentScenario]:
        """Get all incident scenarios, keyed by scenario name.

        BUG FIX: the method takes no self/cls, so it must be a @staticmethod;
        without the decorator, calling it on an instance raised TypeError.
        """
        cache_miss = IncidentScenario(
            name="Cache Miss Storm",
            severity=IncidentSeverity.CRITICAL,
            metrics={
                "Cache Hit Rate": "18.5% (Critical)",
                "Database Load": "92% (Overloaded)",
                "Response Time": "1850ms (Slow)",
                "Affected Users": "45,000",
                "Eviction Rate": "125/sec"
            },
            impact={
                "Revenue Loss": "$8,500/hour",
                "Page Load Time": "+300%",
                "Users Impacted": "45,000",
                "SLA Violation": "Yes",
                "Customer Satisfaction": "-40%"
            },
            arf_pattern="cache_miss_storm",
            oss_analysis=OSSAnalysis(
                status="✅ Analysis Complete",
                recommendations=[
                    "Increase Redis cache memory allocation by 2x",
                    "Implement cache warming strategy with predictive loading",
                    "Optimize key patterns and implement TTL adjustments",
                    "Add circuit breaker for graceful database fallback",
                    "Deploy monitoring for cache hit rate trends"
                ],
                estimated_time="60-90 minutes",
                engineers_needed="2-3 SREs + 1 DBA",
                manual_effort="High",
                confidence_score=0.92,
                healing_intent={
                    "type": "scale_out",
                    "resource": "cache",
                    "scale_factor": 2.0
                }
            ),
            enterprise_results=EnterpriseResults(
                actions_completed=[
                    "✅ Auto-scaled Redis cluster: 4GB → 8GB",
                    "✅ Deployed intelligent cache warming service",
                    "✅ Optimized 12 key patterns with ML recommendations",
                    "✅ Implemented circuit breaker with 95% success rate",
                    "✅ Validated recovery with automated testing"
                ],
                metrics_improvement={
                    "Cache Hit Rate": "18.5% → 72%",
                    "Response Time": "1850ms → 450ms",
                    "Database Load": "92% → 45%",
                    "Throughput": "1250 → 2450 req/sec"
                },
                business_impact={
                    "Recovery Time": "60 min → 12 min",
                    "Cost Saved": "$7,200",
                    "Users Impacted": "45,000 → 0",
                    "Revenue Protected": "$1,700",
                    "MTTR Improvement": "80% reduction"
                },
                approval_required=True,
                execution_time="8 minutes"
            )
        )
        db_exhaustion = IncidentScenario(
            name="Database Connection Pool Exhaustion",
            severity=IncidentSeverity.HIGH,
            metrics={
                "Active Connections": "98/100 (Critical)",
                "API Latency": "2450ms",
                "Error Rate": "15.2%",
                "Queue Depth": "1250",
                "Connection Wait Time": "45s"
            },
            impact={
                "Revenue Loss": "$4,200/hour",
                "Affected Services": "API Gateway, User Service, Payment Service",
                "SLA Violation": "Yes",
                "Partner Impact": "3 external APIs"
            },
            arf_pattern="db_connection_exhaustion",
            oss_analysis=OSSAnalysis(
                status="✅ Analysis Complete",
                recommendations=[
                    "Increase connection pool size from 100 to 200",
                    "Add connection timeout (30s)",
                    "Implement leak detection",
                    "Add connection health checks",
                    "Optimize query patterns"
                ],
                estimated_time="45-60 minutes",
                engineers_needed="1-2 DBAs",
                manual_effort="Medium-High",
                confidence_score=0.88
            )
        )
        memory_leak = IncidentScenario(
            name="Memory Leak in Production",
            severity=IncidentSeverity.HIGH,
            metrics={
                "Memory Usage": "96% (Critical)",
                "GC Pause Time": "4500ms",
                "Error Rate": "28.5%",
                "Restart Frequency": "12/hour",
                "Heap Fragmentation": "42%"
            },
            impact={
                "Revenue Loss": "$5,500/hour",
                "Session Loss": "8,500 users",
                "Customer Impact": "High",
                "Support Tickets": "+300%"
            },
            arf_pattern="memory_leak_java",
            oss_analysis=OSSAnalysis(
                status="✅ Analysis Complete",
                recommendations=[
                    "Increase JVM heap size from 4GB to 8GB",
                    "Implement memory leak detection with profiling",
                    "Add proactive health checks",
                    "Schedule rolling restart with zero downtime",
                    "Deploy memory monitoring dashboard"
                ],
                estimated_time="75-90 minutes",
                engineers_needed="2 Java SREs",
                manual_effort="High",
                confidence_score=0.85
            )
        )
        # No oss_analysis/enterprise_results: this scenario demos the
        # pre-analysis state.
        api_rate_limit = IncidentScenario(
            name="API Rate Limit Exceeded",
            severity=IncidentSeverity.MEDIUM,
            metrics={
                "429 Error Rate": "42.5%",
                "Successful Requests": "58.3%",
                "API Latency": "120ms",
                "Queue Depth": "1250",
                "Client Satisfaction": "65/100"
            },
            impact={
                "Revenue Loss": "$1,800/hour",
                "Affected Partners": "8",
                "Partner SLA Violations": "3",
                "Business Impact": "Medium"
            },
            arf_pattern="api_rate_limit"
        )
        return {
            "Cache Miss Storm": cache_miss,
            "Database Connection Pool Exhaustion": db_exhaustion,
            "Memory Leak in Production": memory_leak,
            "API Rate Limit Exceeded": api_rate_limit
        }