petter2025 commited on
Commit
17f934a
·
verified ·
1 Parent(s): 6686a2d

Delete ml_models.py

Browse files
Files changed (1) hide show
  1. ml_models.py +0 -526
ml_models.py DELETED
@@ -1,526 +0,0 @@
1
- """
2
- Machine Learning Models for Advanced Anomaly Detection
3
- Includes ensemble methods, causal inference, and adaptive thresholds
4
- """
5
-
6
- import numpy as np
7
- from typing import Tuple, Optional, Dict, List
8
- import logging
9
- import datetime
10
-
11
- # Try importing optional ML libraries
12
- try:
13
- from sklearn.ensemble import IsolationForest
14
- from sklearn.preprocessing import StandardScaler
15
- SKLEARN_AVAILABLE = True
16
- except ImportError:
17
- SKLEARN_AVAILABLE = False
18
- logging.warning("scikit-learn not available. Using fallback detection only.")
19
-
20
- try:
21
- import torch
22
- import torch.nn as nn
23
- PYTORCH_AVAILABLE = True
24
- except ImportError:
25
- PYTORCH_AVAILABLE = False
26
- logging.warning("PyTorch not available. LSTM detector disabled.")
27
-
28
- logger = logging.getLogger(__name__)
29
-
30
- # === LSTM Model (Optional - Only if PyTorch available) ===
31
-
32
if PYTORCH_AVAILABLE:
    class LSTMAnomalyDetector(nn.Module):
        """
        LSTM-based anomaly detector for time-series analysis.

        Uses sequence-to-sequence learning to predict next values and flag
        anomalies based on prediction error.

        Args:
            input_size: Number of features per time step
                (latency, error_rate, cpu, memory, throughput by default).
            hidden_size: LSTM hidden state width.
            num_layers: Number of stacked LSTM layers.
        """

        def __init__(self, input_size: int = 5, hidden_size: int = 64, num_layers: int = 2):
            # Python 3 zero-argument super() instead of the legacy
            # super(LSTMAnomalyDetector, self) form.
            super().__init__()

            self.hidden_size = hidden_size
            self.num_layers = num_layers

            # LSTM layers (dropout only applies between layers, so it is
            # effective here because num_layers defaults to 2)
            self.lstm = nn.LSTM(
                input_size=input_size,
                hidden_size=hidden_size,
                num_layers=num_layers,
                batch_first=True,
                dropout=0.2
            )

            # Fully connected projection head back to the feature space
            self.fc1 = nn.Linear(hidden_size, 32)
            self.fc2 = nn.Linear(32, input_size)
            self.relu = nn.ReLU()

        def forward(self, x):
            """
            Forward pass through the network.

            Args:
                x: Tensor shaped (batch, seq_len, input_size) — batch_first=True.

            Returns:
                Tensor shaped (batch, input_size): the predicted next feature
                vector derived from the last LSTM time step.
            """
            # LSTM forward pass; hidden/cell states are discarded
            lstm_out, _ = self.lstm(x)

            # Take the last time step's output as the sequence summary
            last_output = lstm_out[:, -1, :]

            # Project back to the input feature space
            out = self.relu(self.fc1(last_output))
            out = self.fc2(out)

            return out
else:
    # Dummy class so callers can still instantiate the name when PyTorch
    # is not installed; it performs no detection.
    class LSTMAnomalyDetector:
        def __init__(self, *args, **kwargs):
            logger.warning("LSTM detector not available (PyTorch not installed)")
78
-
79
- # === Ensemble Anomaly Detector ===
80
-
81
class EnsembleAnomalyDetector:
    """
    Ensemble of multiple anomaly detection algorithms for robust detection.

    Combines an Isolation Forest, an (optional, not yet trained) LSTM model
    and simple z-score statistics. Gracefully degrades to threshold-based
    fallback detection when ML libraries aren't available or the ensemble
    hasn't been trained yet.
    """

    def __init__(self):
        self.isolation_forest = None
        self.lstm_model = None
        self.scaler = None
        self.is_trained = False
        self.training_data = []

        # Initialize models only if the optional libraries imported cleanly
        if SKLEARN_AVAILABLE:
            try:
                self.isolation_forest = IsolationForest(
                    contamination=0.1,
                    random_state=42,
                    n_estimators=100
                )
                self.scaler = StandardScaler()
                logger.info("Initialized Isolation Forest detector")
            except Exception as e:
                logger.error(f"Failed to initialize Isolation Forest: {e}")

        if PYTORCH_AVAILABLE:
            try:
                self.lstm_model = LSTMAnomalyDetector()
                logger.info("Initialized LSTM detector")
            except Exception as e:
                logger.error(f"Failed to initialize LSTM: {e}")

        logger.info(f"EnsembleAnomalyDetector initialized (sklearn={SKLEARN_AVAILABLE}, pytorch={PYTORCH_AVAILABLE})")

    def add_sample(self, features: np.ndarray) -> None:
        """
        Add a training sample; auto-trains once 100 samples have accumulated.

        Args:
            features: numpy array of [latency, error_rate, cpu, memory, throughput]
        """
        if not isinstance(features, np.ndarray):
            features = np.array(features)

        self.training_data.append(features)

        # Auto-train when we have enough data
        if len(self.training_data) >= 100 and not self.is_trained:
            self.train()

    def train(self) -> None:
        """
        Train all available models in the ensemble.

        No-op (with a warning) below 50 samples. On any failure the ensemble
        stays untrained so predictions keep using the fallback path.
        """
        if len(self.training_data) < 50:
            logger.warning(f"Insufficient data for training: {len(self.training_data)} samples (need 50+)")
            return

        try:
            X = np.array(self.training_data)

            # Train Isolation Forest if available
            if self.isolation_forest is not None and SKLEARN_AVAILABLE:
                self.isolation_forest.fit(X)
                logger.info(f"Trained Isolation Forest on {len(self.training_data)} samples")

            # Train LSTM if available (placeholder for now)
            if self.lstm_model is not None and PYTORCH_AVAILABLE:
                # TODO: Implement full LSTM training loop.
                # For now just fit the scaler so it is ready for the future
                # LSTM path (the previously computed fit_transform result
                # was never used, so only the fit side effect is kept).
                if self.scaler is not None:
                    self.scaler.fit(X)
                logger.info("LSTM training not yet implemented (using fallback)")

            self.is_trained = True
            logger.info(f"✅ Ensemble trained on {len(self.training_data)} samples")

        except Exception as e:
            logger.error(f"Training failed: {e}", exc_info=True)
            self.is_trained = False

    def predict_anomaly(self, features: np.ndarray) -> Tuple[bool, float, Dict]:
        """
        Predict whether the given features represent an anomaly.

        Args:
            features: numpy array of [latency, error_rate, cpu, memory, throughput]

        Returns:
            Tuple of (is_anomaly: bool, confidence: float, explanation: dict)
        """
        if not isinstance(features, np.ndarray):
            features = np.array(features)

        # Fall back to threshold detection when untrained, when sklearn is
        # missing, or when the Isolation Forest failed to initialize
        # (previously the None forest only reached fallback via the
        # AttributeError caught below).
        if not self.is_trained or not SKLEARN_AVAILABLE or self.isolation_forest is None:
            return self._fallback_detection(features)

        try:
            # Isolation Forest prediction (score_samples: lower = more anomalous)
            if_score = self.isolation_forest.score_samples(features.reshape(1, -1))[0]
            if_anomaly = self.isolation_forest.predict(features.reshape(1, -1))[0] == -1

            # LSTM prediction (placeholder for now)
            lstm_score = 0.5  # TODO: Implement actual LSTM prediction

            # Statistical tests
            stat_score = self._statistical_tests(features)

            # Ensemble voting (weighted average)
            confidence = np.mean([
                abs(if_score),
                lstm_score,
                stat_score
            ])

            is_anomaly = if_anomaly or confidence > 0.7

            explanation = {
                'isolation_forest_score': float(if_score),
                'isolation_forest_anomaly': bool(if_anomaly),
                'lstm_reconstruction_error': float(lstm_score),
                'statistical_score': float(stat_score),
                'ensemble_confidence': float(confidence),
                'primary_detector': 'isolation_forest' if if_anomaly else 'ensemble',
                'models_used': ['isolation_forest', 'statistical']
            }

            return is_anomaly, confidence, explanation

        except Exception as e:
            logger.error(f"Prediction failed, using fallback: {e}", exc_info=True)
            return self._fallback_detection(features)

    def _statistical_tests(self, features: np.ndarray) -> float:
        """
        Z-score based anomaly probability against the training window.

        Args:
            features: Current feature values

        Returns:
            Anomaly probability in [0, 1]; 0.5 when fewer than 10 samples
            exist or the computation fails.
        """
        if len(self.training_data) < 10:
            return 0.5

        try:
            # Per-feature z-scores against the historical window
            historical = np.array(self.training_data)
            mean = np.mean(historical, axis=0)
            std = np.std(historical, axis=0)

            # Epsilon avoids division by zero for constant features
            z_scores = np.abs((features - mean) / (std + 1e-8))
            max_z_score = np.max(z_scores)

            # Map the worst z-score to a probability via the 3-sigma rule
            # (z >= 3 saturates at 1.0)
            anomaly_prob = min(1.0, max_z_score / 3.0)

            return anomaly_prob

        except Exception as e:
            logger.error(f"Statistical test failed: {e}")
            return 0.5

    def _fallback_detection(self, features: np.ndarray) -> Tuple[bool, float, Dict]:
        """
        Fallback detection when ML models aren't trained or available.

        Uses fixed per-metric thresholds; confidence is a coarse constant.

        Args:
            features: [latency, error_rate, cpu, memory, throughput]

        Returns:
            Tuple of (is_anomaly, confidence, explanation)
        """
        latency_threshold = 150
        error_rate_threshold = 0.05
        cpu_threshold = 0.8
        memory_threshold = 0.8

        # Tolerate short feature vectors by defaulting missing entries to 0
        latency = features[0] if len(features) > 0 else 0
        error_rate = features[1] if len(features) > 1 else 0
        cpu = features[2] if len(features) > 2 else 0
        memory = features[3] if len(features) > 3 else 0

        is_anomaly = (
            latency > latency_threshold or
            error_rate > error_rate_threshold or
            cpu > cpu_threshold or
            memory > memory_threshold
        )

        confidence = 0.5 if is_anomaly else 0.1

        explanation = {
            'method': 'fallback_threshold',
            'latency_exceeded': latency > latency_threshold,
            'error_rate_exceeded': error_rate > error_rate_threshold,
            'cpu_exceeded': cpu > cpu_threshold,
            'memory_exceeded': memory > memory_threshold
        }

        return is_anomaly, confidence, explanation
286
-
287
- # === Causal Inference Engine ===
288
-
289
class CausalInferenceEngine:
    """
    Bayesian causal inference for root cause analysis.

    A small hand-built causal graph maps each candidate root cause to the
    symptoms it is expected to produce; observed symptoms are scored against
    that graph and combined with fixed priors.
    """

    def __init__(self):
        # Causal relationships: root cause -> symptoms it produces
        self.causal_graph = {
            'database_latency': ['api_latency', 'error_rate'],
            'network_issues': ['api_latency', 'timeout_errors'],
            'memory_leak': ['memory_util', 'gc_time', 'response_time'],
            'cpu_saturation': ['cpu_util', 'queue_length', 'latency'],
            'traffic_spike': ['throughput', 'latency', 'error_rate']
        }

        # Prior belief P(cause) for each root cause
        self.prior_probabilities = {
            'database_latency': 0.3,
            'network_issues': 0.2,
            'memory_leak': 0.15,
            'cpu_saturation': 0.2,
            'traffic_spike': 0.15
        }

        logger.info("Initialized CausalInferenceEngine")

    def infer_root_cause(self, symptoms: Dict[str, float]) -> List[Tuple[str, float]]:
        """
        Rank candidate root causes for the observed symptoms.

        Args:
            symptoms: Observed symptoms and their values,
                      e.g. {'api_latency': 500, 'error_rate': 0.15, 'cpu_util': 0.9}

        Returns:
            List of (root_cause, probability) tuples, highest probability first.
        """
        # Unnormalized posterior per cause:
        # P(cause|symptoms) ∝ P(symptoms|cause) * P(cause)
        scores = {
            cause: self._calculate_likelihood(symptoms, effects) * self.prior_probabilities[cause]
            for cause, effects in self.causal_graph.items()
        }

        # Normalize to a probability distribution; if everything scored zero,
        # spread the mass uniformly instead.
        total = sum(scores.values())
        if total > 0:
            posterior_probs = {cause: score / total for cause, score in scores.items()}
        else:
            uniform = 1.0 / len(scores)
            posterior_probs = {cause: uniform for cause in scores}

        ranked_causes = sorted(
            posterior_probs.items(),
            key=lambda item: item[1],
            reverse=True
        )

        logger.info(f"Inferred root causes: {ranked_causes[:3]}")

        return ranked_causes

    def _calculate_likelihood(self, symptoms: Dict[str, float], effects: List[str]) -> float:
        """
        Likelihood P(symptoms|cause): fraction of the cause's expected
        effects that actually appear among the observed symptoms.

        Args:
            symptoms: Observed symptoms
            effects: Effects the cause is expected to produce

        Returns:
            Likelihood score in (0, 1]; floor of 0.1 when nothing matches.
        """
        observed = [effect for effect in effects if effect in symptoms]

        # Low but non-zero probability when no expected effect is seen
        if not observed:
            return 0.1

        return len(observed) / len(effects)
378
-
379
- # === Adaptive Threshold Learner ===
380
-
381
class AdaptiveThresholdLearner:
    """
    Online learning system that adapts thresholds based on historical patterns.

    Keeps a bounded sliding window of recent values per metric and derives a
    threshold (mean + 2*std) scaled by a time-of-day / day-of-week multiplier.
    """

    def __init__(self, window_size: int = 100):
        self.window_size = window_size
        self.historical_data: Dict[str, List[Dict]] = {}
        self.thresholds: Dict[str, Dict] = {}
        self.seasonality_patterns: Dict[str, Dict] = {}

        logger.info(f"Initialized AdaptiveThresholdLearner with window_size={window_size}")

    def update(self, metric: str, value: float, timestamp: datetime.datetime) -> None:
        """
        Record a new measurement and refresh the metric's threshold.

        Args:
            metric: Metric name (e.g., 'latency', 'error_rate')
            value: Metric value
            timestamp: Timestamp of the measurement
        """
        window = self.historical_data.setdefault(metric, [])
        window.append({'value': value, 'timestamp': timestamp})

        # Evict the oldest entry once the sliding window overflows
        if len(window) > self.window_size:
            window.pop(0)

        self._update_threshold(metric)

    def _update_threshold(self, metric: str) -> None:
        """
        Recompute the adaptive threshold for a metric from its window.

        Needs at least 10 samples; otherwise leaves any existing threshold
        untouched.

        Args:
            metric: Metric name
        """
        window = self.historical_data[metric]
        if len(window) < 10:
            return

        try:
            values = [entry['value'] for entry in window]

            # Window statistics
            mean = np.mean(values)
            std = np.std(values)
            p90 = np.percentile(values, 90)
            p95 = np.percentile(values, 95)

            # Seasonality: scale by when the latest sample was observed
            latest = window[-1]['timestamp']
            time_multiplier = self._get_time_multiplier(latest.hour, latest.weekday())

            # Adaptive threshold: mean + 2*std, adjusted for time of day
            threshold = (mean + 2 * std) * time_multiplier

            self.thresholds[metric] = {
                'value': threshold,
                'mean': mean,
                'std': std,
                'p90': p90,
                'p95': p95,
                'last_updated': datetime.datetime.now(),
                'time_multiplier': time_multiplier
            }

            logger.debug(f"Updated threshold for {metric}: {threshold:.2f}")

        except Exception as e:
            logger.error(f"Failed to update threshold for {metric}: {e}")

    def _get_time_multiplier(self, hour: int, day_of_week: int) -> float:
        """
        Threshold multiplier for the given time of day and weekday.

        Args:
            hour: Hour of day (0-23)
            day_of_week: Day of week (0=Monday, 6=Sunday)

        Returns:
            1.2 during weekday business hours (9 AM - 5 PM), else 0.8
            (more sensitive off-hours/weekends).
        """
        is_weekday = day_of_week < 5
        is_business_hours = 9 <= hour <= 17
        return 1.2 if (is_weekday and is_business_hours) else 0.8

    def get_threshold(self, metric: str) -> Optional[float]:
        """
        Current adaptive threshold for a metric.

        Args:
            metric: Metric name

        Returns:
            Threshold value, or None when no threshold has been computed yet.
        """
        entry = self.thresholds.get(metric)
        return entry['value'] if entry is not None else None

    def get_statistics(self, metric: str) -> Optional[Dict]:
        """
        Full statistics dictionary for a metric.

        Args:
            metric: Metric name

        Returns:
            Statistics dictionary, or None when unavailable.
        """
        return self.thresholds.get(metric)
507
-
508
- # === Utility Functions ===
509
-
510
def create_feature_vector(event) -> np.ndarray:
    """
    Convert a ReliabilityEvent into the feature vector used by the ML models.

    Args:
        event: ReliabilityEvent object exposing latency_p99, error_rate,
               cpu_util, memory_util and throughput attributes.

    Returns:
        numpy array of [latency, error_rate, cpu, memory, throughput];
        missing (None) cpu/memory utilization falls back to 0.5.
    """
    cpu = 0.5 if event.cpu_util is None else event.cpu_util
    memory = 0.5 if event.memory_util is None else event.memory_util

    return np.array([
        event.latency_p99,
        event.error_rate,
        cpu,
        memory,
        event.throughput
    ])