petter2025 committed on
Commit
ee73001
·
verified ·
1 Parent(s): 17f934a

Delete agent_orchestrator.py

Browse files
Files changed (1) hide show
  1. agent_orchestrator.py +0 -461
agent_orchestrator.py DELETED
@@ -1,461 +0,0 @@
1
- import asyncio
2
- from typing import Dict, List, Any
3
- from dataclasses import dataclass
4
- from monitoring_models import AgentSpecialization
5
- from models import ReliabilityEvent, AnomalyResult
6
-
7
@dataclass
class AgentResult:
    """Outcome of one specialized agent's analysis of a reliability event."""

    specialization: AgentSpecialization  # which agent role produced this result
    confidence: float                    # confidence in the findings, 0-1
    findings: Dict[str, Any]             # structured analysis details
    recommendations: List[str]           # ordered, actionable next steps
    processing_time: float               # seconds spent producing this result
14
-
15
class BaseAgent:
    """Common scaffolding shared by all specialized reliability agents.

    Holds the agent's specialization tag plus simple running performance
    counters, and defines the ``analyze`` contract subclasses implement.
    """

    def __init__(self, specialization: AgentSpecialization):
        self.specialization = specialization
        # Running counters describing this agent's activity so far.
        self.performance_metrics = {
            'processed_events': 0,
            'successful_analyses': 0,
            'average_confidence': 0.0,
        }

    async def analyze(self, event: ReliabilityEvent) -> AgentResult:
        """Analyze one reliability event; concrete agents must override this."""
        raise NotImplementedError
27
-
28
class AnomalyDetectionAgent(BaseAgent):
    """Detective agent: scores reliability events for anomalies and matches
    them against known failure patterns.
    """

    def __init__(self):
        super().__init__(AgentSpecialization.DETECTIVE)
        # Thresholds above which a metric is treated as anomalous.
        self.adaptive_thresholds = {
            'latency_p99': 150,
            'error_rate': 0.05,
            'cpu_util': 0.8,
            'memory_util': 0.8
        }

    async def analyze(self, event: ReliabilityEvent) -> AgentResult:
        """Enhanced anomaly detection with pattern recognition.

        Returns an AgentResult whose confidence equals the composite anomaly
        score in [0, 1].
        """
        # asyncio.get_event_loop() is deprecated inside a running coroutine
        # since Python 3.10; get_running_loop() is the supported replacement.
        loop = asyncio.get_running_loop()
        start_time = loop.time()

        # Multi-dimensional anomaly scoring
        anomaly_score = self._calculate_anomaly_score(event)
        pattern_match = self._detect_known_patterns(event)

        return AgentResult(
            specialization=self.specialization,
            confidence=anomaly_score,
            findings={
                'anomaly_score': anomaly_score,
                'detected_patterns': pattern_match,
                'affected_metrics': self._identify_affected_metrics(event),
                'severity_tier': self._classify_severity(anomaly_score)
            },
            recommendations=self._generate_detection_recommendations(event, anomaly_score),
            processing_time=loop.time() - start_time
        )

    def _calculate_anomaly_score(self, event: ReliabilityEvent) -> float:
        """Calculate comprehensive anomaly score (0-1)."""
        scores = []

        # Latency anomaly (weighted 40%); scaled over a 500ms overshoot window
        if event.latency_p99 > self.adaptive_thresholds['latency_p99']:
            latency_score = min(1.0, (event.latency_p99 - self.adaptive_thresholds['latency_p99']) / 500)
            scores.append(0.4 * latency_score)

        # Error rate anomaly (weighted 30%); saturates at a 30% error rate
        if event.error_rate > self.adaptive_thresholds['error_rate']:
            error_score = min(1.0, event.error_rate / 0.3)
            scores.append(0.3 * error_score)

        # Resource anomaly (weighted 30%, split evenly across CPU and memory)
        resource_score = 0
        if event.cpu_util and event.cpu_util > self.adaptive_thresholds['cpu_util']:
            resource_score += 0.15 * min(1.0, (event.cpu_util - self.adaptive_thresholds['cpu_util']) / 0.2)
        if event.memory_util and event.memory_util > self.adaptive_thresholds['memory_util']:
            resource_score += 0.15 * min(1.0, (event.memory_util - self.adaptive_thresholds['memory_util']) / 0.2)
        scores.append(resource_score)

        return min(1.0, sum(scores))

    def _detect_known_patterns(self, event: ReliabilityEvent) -> List[str]:
        """Detect known failure patterns; returns ["unknown_pattern"] when none match."""
        patterns = []

        # Database timeout pattern
        if event.latency_p99 > 500 and event.error_rate > 0.2:
            patterns.append("database_timeout")

        # Resource exhaustion pattern
        if event.cpu_util and event.cpu_util > 0.9 and event.memory_util and event.memory_util > 0.9:
            patterns.append("resource_exhaustion")

        # Cascading failure pattern
        if event.error_rate > 0.15 and event.latency_p99 > 300:
            patterns.append("cascading_failure")

        # Traffic spike pattern — guard throughput like cpu/memory above,
        # since the reading may be absent (was previously unguarded).
        if event.latency_p99 > 200 and event.throughput is not None and event.throughput > 2000:
            patterns.append("traffic_spike")

        # Gradual degradation
        if 150 < event.latency_p99 < 300 and 0.05 < event.error_rate < 0.15:
            patterns.append("gradual_degradation")

        return patterns if patterns else ["unknown_pattern"]

    def _identify_affected_metrics(self, event: ReliabilityEvent) -> List[str]:
        """Identify which metrics are outside normal range."""
        affected = []

        if event.latency_p99 > self.adaptive_thresholds['latency_p99']:
            affected.append("latency")

        if event.error_rate > self.adaptive_thresholds['error_rate']:
            affected.append("error_rate")

        if event.cpu_util and event.cpu_util > self.adaptive_thresholds['cpu_util']:
            affected.append("cpu")

        if event.memory_util and event.memory_util > self.adaptive_thresholds['memory_util']:
            affected.append("memory")

        # Low throughput threshold; None-guarded for consistency with the
        # cpu/memory checks above.
        if event.throughput is not None and event.throughput < 500:
            affected.append("throughput")

        return affected if affected else ["none"]

    def _classify_severity(self, anomaly_score: float) -> str:
        """Classify severity tier (CRITICAL/HIGH/MEDIUM/LOW) from the anomaly score."""
        if anomaly_score > 0.8:
            return "CRITICAL"
        elif anomaly_score > 0.6:
            return "HIGH"
        elif anomaly_score > 0.4:
            return "MEDIUM"
        return "LOW"

    def _generate_detection_recommendations(self, event: ReliabilityEvent, anomaly_score: float) -> List[str]:
        """Generate actionable recommendations based on detected anomalies.

        Returns at most 5 recommendations, most severe first.
        """
        recommendations = []

        # Latency recommendations
        if event.latency_p99 > 500:
            recommendations.append("🚨 CRITICAL: Latency >500ms - Check database connections and external APIs immediately")
        elif event.latency_p99 > 300:
            recommendations.append("⚠️ HIGH: Latency >300ms - Investigate slow queries and service dependencies")
        elif event.latency_p99 > 150:
            recommendations.append("📈 Latency elevated - Monitor trends and consider optimization")

        # Error rate recommendations
        if event.error_rate > 0.3:
            recommendations.append("🚨 CRITICAL: Error rate >30% - Rollback recent deployments or enable circuit breaker")
        elif event.error_rate > 0.15:
            recommendations.append("⚠️ HIGH: Error rate >15% - Review application logs for exceptions")
        elif event.error_rate > 0.05:
            recommendations.append("📈 Errors increasing - Check for configuration issues")

        # Resource recommendations
        if event.cpu_util and event.cpu_util > 0.9:
            recommendations.append("🔥 CPU CRITICAL: >90% utilization - Scale horizontally or optimize hot paths")
        elif event.cpu_util and event.cpu_util > 0.8:
            recommendations.append("⚡ CPU HIGH: >80% utilization - Consider adding capacity")

        if event.memory_util and event.memory_util > 0.9:
            recommendations.append("💾 MEMORY CRITICAL: >90% utilization - Check for memory leaks")
        elif event.memory_util and event.memory_util > 0.8:
            recommendations.append("💾 MEMORY HIGH: >80% utilization - Monitor for leaks")

        # Overall severity recommendations
        if anomaly_score > 0.8:
            recommendations.append("🎯 IMMEDIATE ACTION REQUIRED: Multiple critical metrics affected")
        elif anomaly_score > 0.6:
            recommendations.append("🎯 INVESTIGATE: Significant performance degradation detected")
        elif anomaly_score > 0.4:
            recommendations.append("📊 MONITOR: Early warning signs detected")

        return recommendations[:5]  # Return top 5 recommendations
180
-
181
class RootCauseAgent(BaseAgent):
    """Diagnostician agent: maps an event's metric signature to likely root
    causes, dependency cascade risk, and temporal correlations.
    """

    def __init__(self):
        super().__init__(AgentSpecialization.DIAGNOSTICIAN)
        self.causal_patterns = self._load_causal_patterns()

    async def analyze(self, event: ReliabilityEvent) -> AgentResult:
        """AI-powered root cause analysis.

        Returns an AgentResult whose confidence is the highest-confidence
        matched causal pattern.
        """
        # asyncio.get_event_loop() is deprecated inside a running coroutine
        # since Python 3.10; get_running_loop() is the supported replacement.
        loop = asyncio.get_running_loop()
        start_time = loop.time()

        root_cause_analysis = self._perform_causal_analysis(event)

        return AgentResult(
            specialization=self.specialization,
            confidence=root_cause_analysis['confidence'],
            findings={
                'likely_root_causes': root_cause_analysis['causes'],
                'evidence_patterns': root_cause_analysis['evidence'],
                'dependency_analysis': self._analyze_dependencies(event),
                'timeline_correlation': self._check_temporal_patterns(event)
            },
            recommendations=root_cause_analysis['investigation_steps'],
            processing_time=loop.time() - start_time
        )

    def _load_causal_patterns(self) -> Dict[str, Any]:
        """Load known causal patterns for root cause analysis.

        NOTE(review): _perform_causal_analysis currently re-encodes these
        rules inline rather than consulting self.causal_patterns — keep the
        two in sync when editing either.
        """
        return {
            'high_latency_high_errors': {
                'pattern': ['latency > 500', 'error_rate > 0.2'],
                'cause': 'Database or external dependency failure',
                'confidence': 0.85
            },
            'high_cpu_high_memory': {
                'pattern': ['cpu > 0.9', 'memory > 0.9'],
                'cause': 'Resource exhaustion or memory leak',
                'confidence': 0.90
            },
            'high_errors_normal_latency': {
                'pattern': ['error_rate > 0.3', 'latency < 200'],
                'cause': 'Application bug or configuration issue',
                'confidence': 0.75
            },
            'gradual_degradation': {
                'pattern': ['200 < latency < 400', '0.05 < error_rate < 0.15'],
                'cause': 'Resource saturation or dependency degradation',
                'confidence': 0.65
            }
        }

    def _perform_causal_analysis(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """Analyze likely root causes based on event patterns.

        Returns a dict with the overall 'confidence' (max over matched
        patterns), 'causes', 'evidence' tags, and 'investigation_steps'.
        """
        causes = []
        evidence = []
        confidence = 0.5

        # Pattern 1: Database/External Dependency Failure
        if event.latency_p99 > 500 and event.error_rate > 0.2:
            causes.append({
                "cause": "Database/External Dependency Failure",
                "confidence": 0.85,
                "evidence": f"Extreme latency ({event.latency_p99:.0f}ms) with high errors ({event.error_rate*100:.1f}%)",
                "investigation": "Check database connection pool, external API health, network connectivity"
            })
            evidence.append("extreme_latency_with_errors")
            confidence = 0.85

        # Pattern 2: Resource Exhaustion
        if event.cpu_util and event.cpu_util > 0.9 and event.memory_util and event.memory_util > 0.9:
            causes.append({
                "cause": "Resource Exhaustion",
                "confidence": 0.90,
                "evidence": f"CPU ({event.cpu_util*100:.1f}%) and Memory ({event.memory_util*100:.1f}%) critically high",
                "investigation": "Check for memory leaks, infinite loops, insufficient resource allocation"
            })
            evidence.append("correlated_resource_exhaustion")
            confidence = max(confidence, 0.90)

        # Pattern 3: Application Bug / Configuration Issue
        if event.error_rate > 0.3 and event.latency_p99 < 200:
            causes.append({
                "cause": "Application Bug / Configuration Issue",
                "confidence": 0.75,
                "evidence": f"High error rate ({event.error_rate*100:.1f}%) without latency impact",
                "investigation": "Review recent deployments, configuration changes, application logs, and error traces"
            })
            evidence.append("errors_without_latency")
            confidence = max(confidence, 0.75)

        # Pattern 4: Gradual Performance Degradation
        if 200 <= event.latency_p99 <= 400 and 0.05 <= event.error_rate <= 0.15:
            causes.append({
                "cause": "Gradual Performance Degradation",
                "confidence": 0.65,
                "evidence": f"Moderate latency ({event.latency_p99:.0f}ms) and errors ({event.error_rate*100:.1f}%)",
                "investigation": "Check resource trends, dependency performance, capacity planning, and scaling policies"
            })
            evidence.append("gradual_degradation")
            confidence = max(confidence, 0.65)

        # Pattern 5: Traffic Spike — guard throughput (may be unset), matching
        # the cpu/memory guards used elsewhere; previously unguarded.
        if event.latency_p99 > 200 and event.throughput is not None and event.throughput > 2000:
            causes.append({
                "cause": "Traffic Spike / Capacity Issue",
                "confidence": 0.70,
                "evidence": f"Elevated latency ({event.latency_p99:.0f}ms) with high throughput ({event.throughput:.0f} req/s)",
                "investigation": "Check autoscaling configuration, rate limiting, and load balancer health"
            })
            evidence.append("traffic_spike")
            confidence = max(confidence, 0.70)

        # Default: Unknown pattern
        if not causes:
            causes.append({
                "cause": "Unknown - Requires Investigation",
                "confidence": 0.3,
                "evidence": "Pattern does not match known failure modes",
                "investigation": "Complete system review needed - check logs, metrics, and recent changes"
            })
            evidence.append("unknown_pattern")
            confidence = 0.3

        # Generate investigation steps from the top 3 candidate causes
        investigation_steps = [cause['investigation'] for cause in causes[:3]]

        return {
            'confidence': confidence,
            'causes': causes,
            'evidence': evidence,
            'investigation_steps': investigation_steps
        }

    def _analyze_dependencies(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """Analyze dependency health and potential cascade effects."""
        analysis = {
            'has_upstream_deps': len(event.upstream_deps) > 0,
            'upstream_services': event.upstream_deps,
            'potential_cascade': False,
            'cascade_risk_score': 0.0
        }

        # Calculate cascade risk from error rate and extreme latency
        if event.error_rate > 0.2:
            analysis['potential_cascade'] = True
            analysis['cascade_risk_score'] = min(1.0, event.error_rate * 2)

        if event.latency_p99 > 500:
            analysis['potential_cascade'] = True
            analysis['cascade_risk_score'] = max(
                analysis['cascade_risk_score'],
                min(1.0, event.latency_p99 / 1000)
            )

        return analysis

    def _check_temporal_patterns(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """Check for time-based correlations.

        NOTE(review): uses the naive local wall clock — confirm whether the
        deployment assumes a specific timezone for "business hours".
        """
        import datetime

        current_time = datetime.datetime.now()
        hour = current_time.hour

        # Heuristic time-of-day buckets
        patterns = {
            'time_of_day_correlation': False,
            'is_peak_hours': 9 <= hour <= 17,  # Business hours
            'is_off_hours': hour < 6 or hour > 22,
            'deployment_window': 14 <= hour <= 16,  # Typical deployment window
            'weekend': current_time.weekday() >= 5
        }

        # Flag potential correlations
        if patterns['is_peak_hours'] and event.latency_p99 > 200:
            patterns['time_of_day_correlation'] = True

        return patterns
356
-
357
class OrchestrationManager:
    """Runs the specialized agents against an event and merges their findings."""

    def __init__(self):
        self.agents = {
            AgentSpecialization.DETECTIVE: AnomalyDetectionAgent(),
            AgentSpecialization.DIAGNOSTICIAN: RootCauseAgent(),
        }
        self.incident_history = []

    async def orchestrate_analysis(self, event: ReliabilityEvent) -> Dict[str, Any]:
        """Coordinate multiple agents for comprehensive analysis.

        Agents run concurrently, each bounded by a 10s timeout; an agent
        that times out or raises is skipped rather than failing the whole
        analysis.
        """
        specializations = list(self.agents.keys())
        # Wrap each analysis in its own timeout and run them with gather so
        # the agents genuinely execute in parallel. (Previously the bare
        # coroutines were awaited one-by-one in a loop, so each agent only
        # started after the prior one finished, despite the "parallel"
        # comment.)
        tasks = [
            asyncio.wait_for(self.agents[spec].analyze(event), timeout=10.0)
            for spec in specializations
        ]
        outcomes = await asyncio.gather(*tasks, return_exceptions=True)

        agent_results = {}
        for specialization, outcome in zip(specializations, outcomes):
            if isinstance(outcome, asyncio.TimeoutError):
                # Agent timeout - continue with others
                print(f"Agent {specialization.value} timed out")
            elif isinstance(outcome, Exception):
                # Agent error - log and continue
                print(f"Agent {specialization.value} error: {outcome}")
            else:
                agent_results[specialization.value] = outcome

        # Synthesize results
        return self._synthesize_agent_findings(event, agent_results)

    def _synthesize_agent_findings(self, event: ReliabilityEvent, agent_results: Dict) -> Dict[str, Any]:
        """Combine insights from all specialized agents.

        The detective result is required; the diagnostician result is merged
        in when available.
        """
        detective_result = agent_results.get(AgentSpecialization.DETECTIVE.value)
        diagnostician_result = agent_results.get(AgentSpecialization.DIAGNOSTICIAN.value)

        if not detective_result:
            return {'error': 'No agent results available'}

        # Build comprehensive analysis
        synthesis = {
            'incident_summary': {
                'severity': detective_result.findings.get('severity_tier', 'UNKNOWN'),
                'anomaly_confidence': detective_result.confidence,
                'primary_metrics_affected': detective_result.findings.get('affected_metrics', [])
            },
            'root_cause_insights': diagnostician_result.findings if diagnostician_result else {},
            'recommended_actions': self._prioritize_actions(
                detective_result.recommendations,
                diagnostician_result.recommendations if diagnostician_result else []
            ),
            'business_context': self._add_business_context(event, detective_result.confidence),
            'agent_metadata': {
                'participating_agents': list(agent_results.keys()),
                'processing_times': {k: v.processing_time for k, v in agent_results.items()}
            }
        }

        return synthesis

    def _prioritize_actions(self, detection_actions: List[str], diagnosis_actions: List[str]) -> List[str]:
        """Combine and prioritize actions from multiple agents.

        Orders critical (🚨) before high (⚠️) before everything else,
        de-duplicates preserving order, and returns at most 5 actions.
        """
        all_actions = []

        # Add critical actions first (those with 🚨)
        critical = [a for a in detection_actions + diagnosis_actions if '🚨' in a]
        all_actions.extend(critical)

        # Add high priority actions (those with ⚠️)
        high = [a for a in detection_actions + diagnosis_actions if '⚠️' in a and a not in all_actions]
        all_actions.extend(high)

        # Add remaining actions
        remaining = [a for a in detection_actions + diagnosis_actions if a not in all_actions]
        all_actions.extend(remaining)

        # Remove duplicates while preserving order
        seen = set()
        unique_actions = []
        for action in all_actions:
            if action not in seen:
                seen.add(action)
                unique_actions.append(action)

        return unique_actions[:5]  # Return top 5 actions

    def _add_business_context(self, event: ReliabilityEvent, confidence: float) -> Dict[str, Any]:
        """Add business impact context derived from the detection confidence."""
        # Calculate business severity tier from confidence
        if confidence > 0.8:
            business_severity = "CRITICAL"
        elif confidence > 0.6:
            business_severity = "HIGH"
        elif confidence > 0.4:
            business_severity = "MEDIUM"
        else:
            business_severity = "LOW"

        return {
            'business_severity': business_severity,
            'estimated_impact': f"{confidence * 100:.0f}% confidence of incident",
            'recommended_escalation': confidence > 0.7
        }