petter2025 commited on
Commit
17f934a
·
verified ·
1 Parent(s): 6686a2d

Delete ml_models.py

Browse files
Files changed (1) hide show
  1. ml_models.py +0 -526
ml_models.py DELETED
@@ -1,526 +0,0 @@
1
- """
2
- Machine Learning Models for Advanced Anomaly Detection
3
- Includes ensemble methods, causal inference, and adaptive thresholds
4
- """
5
-
6
- import numpy as np
7
- from typing import Tuple, Optional, Dict, List
8
- import logging
9
- import datetime
10
-
11
- # Try importing optional ML libraries
12
- try:
13
- from sklearn.ensemble import IsolationForest
14
- from sklearn.preprocessing import StandardScaler
15
- SKLEARN_AVAILABLE = True
16
- except ImportError:
17
- SKLEARN_AVAILABLE = False
18
- logging.warning("scikit-learn not available. Using fallback detection only.")
19
-
20
- try:
21
- import torch
22
- import torch.nn as nn
23
- PYTORCH_AVAILABLE = True
24
- except ImportError:
25
- PYTORCH_AVAILABLE = False
26
- logging.warning("PyTorch not available. LSTM detector disabled.")
27
-
28
- logger = logging.getLogger(__name__)
29
-
30
- # === LSTM Model (Optional - Only if PyTorch available) ===
31
-
32
if PYTORCH_AVAILABLE:
    class LSTMAnomalyDetector(nn.Module):
        """
        LSTM-based anomaly detector for time-series analysis.

        Uses sequence-to-sequence learning to predict next values and flag
        anomalies based on prediction error.

        Args:
            input_size: Number of features per time step
                (latency, error_rate, cpu, memory, throughput by default).
            hidden_size: LSTM hidden state width.
            num_layers: Number of stacked LSTM layers.
        """

        def __init__(self, input_size: int = 5, hidden_size: int = 64, num_layers: int = 2):
            # Python 3 zero-argument super() instead of the legacy
            # super(LSTMAnomalyDetector, self) form.
            super().__init__()

            self.hidden_size = hidden_size
            self.num_layers = num_layers

            # LSTM layers (dropout only applies between layers, so it is
            # effective here because num_layers defaults to 2)
            self.lstm = nn.LSTM(
                input_size=input_size,
                hidden_size=hidden_size,
                num_layers=num_layers,
                batch_first=True,
                dropout=0.2
            )

            # Fully connected projection head back to the feature space
            self.fc1 = nn.Linear(hidden_size, 32)
            self.fc2 = nn.Linear(32, input_size)
            self.relu = nn.ReLU()

        def forward(self, x):
            """
            Forward pass through the network.

            Args:
                x: Tensor shaped (batch, seq_len, input_size) — batch_first=True.

            Returns:
                Tensor shaped (batch, input_size): the predicted next feature
                vector derived from the last LSTM time step.
            """
            # LSTM forward pass; hidden/cell states are discarded
            lstm_out, _ = self.lstm(x)

            # Take the last time step's output as the sequence summary
            last_output = lstm_out[:, -1, :]

            # Project back to the input feature space
            out = self.relu(self.fc1(last_output))
            out = self.fc2(out)

            return out
else:
    # Dummy class so callers can still instantiate the name when PyTorch
    # is not installed; it performs no detection.
    class LSTMAnomalyDetector:
        def __init__(self, *args, **kwargs):
            logger.warning("LSTM detector not available (PyTorch not installed)")
78
-
79
- # === Ensemble Anomaly Detector ===
80
-
81
class EnsembleAnomalyDetector:
    """
    Ensemble of multiple anomaly detection algorithms for robust detection.

    Combines an Isolation Forest, an (optional, not yet trained) LSTM model
    and simple z-score statistics. Gracefully degrades to threshold-based
    fallback detection when ML libraries aren't available or the ensemble
    hasn't been trained yet.
    """

    def __init__(self):
        self.isolation_forest = None
        self.lstm_model = None
        self.scaler = None
        self.is_trained = False
        self.training_data = []

        # Initialize models only if the optional libraries imported cleanly
        if SKLEARN_AVAILABLE:
            try:
                self.isolation_forest = IsolationForest(
                    contamination=0.1,
                    random_state=42,
                    n_estimators=100
                )
                self.scaler = StandardScaler()
                logger.info("Initialized Isolation Forest detector")
            except Exception as e:
                logger.error(f"Failed to initialize Isolation Forest: {e}")

        if PYTORCH_AVAILABLE:
            try:
                self.lstm_model = LSTMAnomalyDetector()
                logger.info("Initialized LSTM detector")
            except Exception as e:
                logger.error(f"Failed to initialize LSTM: {e}")

        logger.info(f"EnsembleAnomalyDetector initialized (sklearn={SKLEARN_AVAILABLE}, pytorch={PYTORCH_AVAILABLE})")

    def add_sample(self, features: np.ndarray) -> None:
        """
        Add a training sample; auto-trains once 100 samples have accumulated.

        Args:
            features: numpy array of [latency, error_rate, cpu, memory, throughput]
        """
        if not isinstance(features, np.ndarray):
            features = np.array(features)

        self.training_data.append(features)

        # Auto-train when we have enough data
        if len(self.training_data) >= 100 and not self.is_trained:
            self.train()

    def train(self) -> None:
        """
        Train all available models in the ensemble.

        No-op (with a warning) below 50 samples. On any failure the ensemble
        stays untrained so predictions keep using the fallback path.
        """
        if len(self.training_data) < 50:
            logger.warning(f"Insufficient data for training: {len(self.training_data)} samples (need 50+)")
            return

        try:
            X = np.array(self.training_data)

            # Train Isolation Forest if available
            if self.isolation_forest is not None and SKLEARN_AVAILABLE:
                self.isolation_forest.fit(X)
                logger.info(f"Trained Isolation Forest on {len(self.training_data)} samples")

            # Train LSTM if available (placeholder for now)
            if self.lstm_model is not None and PYTORCH_AVAILABLE:
                # TODO: Implement full LSTM training loop.
                # For now just fit the scaler so it is ready for the future
                # LSTM path (the previously computed fit_transform result
                # was never used, so only the fit side effect is kept).
                if self.scaler is not None:
                    self.scaler.fit(X)
                logger.info("LSTM training not yet implemented (using fallback)")

            self.is_trained = True
            logger.info(f"✅ Ensemble trained on {len(self.training_data)} samples")

        except Exception as e:
            logger.error(f"Training failed: {e}", exc_info=True)
            self.is_trained = False

    def predict_anomaly(self, features: np.ndarray) -> Tuple[bool, float, Dict]:
        """
        Predict whether the given features represent an anomaly.

        Args:
            features: numpy array of [latency, error_rate, cpu, memory, throughput]

        Returns:
            Tuple of (is_anomaly: bool, confidence: float, explanation: dict)
        """
        if not isinstance(features, np.ndarray):
            features = np.array(features)

        # Fall back to threshold detection when untrained, when sklearn is
        # missing, or when the Isolation Forest failed to initialize
        # (previously the None forest only reached fallback via the
        # AttributeError caught below).
        if not self.is_trained or not SKLEARN_AVAILABLE or self.isolation_forest is None:
            return self._fallback_detection(features)

        try:
            # Isolation Forest prediction (score_samples: lower = more anomalous)
            if_score = self.isolation_forest.score_samples(features.reshape(1, -1))[0]
            if_anomaly = self.isolation_forest.predict(features.reshape(1, -1))[0] == -1

            # LSTM prediction (placeholder for now)
            lstm_score = 0.5  # TODO: Implement actual LSTM prediction

            # Statistical tests
            stat_score = self._statistical_tests(features)

            # Ensemble voting (weighted average)
            confidence = np.mean([
                abs(if_score),
                lstm_score,
                stat_score
            ])

            is_anomaly = if_anomaly or confidence > 0.7

            explanation = {
                'isolation_forest_score': float(if_score),
                'isolation_forest_anomaly': bool(if_anomaly),
                'lstm_reconstruction_error': float(lstm_score),
                'statistical_score': float(stat_score),
                'ensemble_confidence': float(confidence),
                'primary_detector': 'isolation_forest' if if_anomaly else 'ensemble',
                'models_used': ['isolation_forest', 'statistical']
            }

            return is_anomaly, confidence, explanation

        except Exception as e:
            logger.error(f"Prediction failed, using fallback: {e}", exc_info=True)
            return self._fallback_detection(features)

    def _statistical_tests(self, features: np.ndarray) -> float:
        """
        Z-score based anomaly probability against the training window.

        Args:
            features: Current feature values

        Returns:
            Anomaly probability in [0, 1]; 0.5 when fewer than 10 samples
            exist or the computation fails.
        """
        if len(self.training_data) < 10:
            return 0.5

        try:
            # Per-feature z-scores against the historical window
            historical = np.array(self.training_data)
            mean = np.mean(historical, axis=0)
            std = np.std(historical, axis=0)

            # Epsilon avoids division by zero for constant features
            z_scores = np.abs((features - mean) / (std + 1e-8))
            max_z_score = np.max(z_scores)

            # Map the worst z-score to a probability via the 3-sigma rule
            # (z >= 3 saturates at 1.0)
            anomaly_prob = min(1.0, max_z_score / 3.0)

            return anomaly_prob

        except Exception as e:
            logger.error(f"Statistical test failed: {e}")
            return 0.5

    def _fallback_detection(self, features: np.ndarray) -> Tuple[bool, float, Dict]:
        """
        Fallback detection when ML models aren't trained or available.

        Uses fixed per-metric thresholds; confidence is a coarse constant.

        Args:
            features: [latency, error_rate, cpu, memory, throughput]

        Returns:
            Tuple of (is_anomaly, confidence, explanation)
        """
        latency_threshold = 150
        error_rate_threshold = 0.05
        cpu_threshold = 0.8
        memory_threshold = 0.8

        # Tolerate short feature vectors by defaulting missing entries to 0
        latency = features[0] if len(features) > 0 else 0
        error_rate = features[1] if len(features) > 1 else 0
        cpu = features[2] if len(features) > 2 else 0
        memory = features[3] if len(features) > 3 else 0

        is_anomaly = (
            latency > latency_threshold or
            error_rate > error_rate_threshold or
            cpu > cpu_threshold or
            memory > memory_threshold
        )

        confidence = 0.5 if is_anomaly else 0.1

        explanation = {
            'method': 'fallback_threshold',
            'latency_exceeded': latency > latency_threshold,
            'error_rate_exceeded': error_rate > error_rate_threshold,
            'cpu_exceeded': cpu > cpu_threshold,
            'memory_exceeded': memory > memory_threshold
        }

        return is_anomaly, confidence, explanation
286
-
287
- # === Causal Inference Engine ===
288
-
289
class CausalInferenceEngine:
    """
    Bayesian causal inference for root cause analysis.

    A small hand-built causal graph maps each candidate root cause to the
    symptoms it is expected to produce; observed symptoms are scored against
    that graph and combined with fixed priors.
    """

    def __init__(self):
        # Causal relationships: root cause -> symptoms it produces
        self.causal_graph = {
            'database_latency': ['api_latency', 'error_rate'],
            'network_issues': ['api_latency', 'timeout_errors'],
            'memory_leak': ['memory_util', 'gc_time', 'response_time'],
            'cpu_saturation': ['cpu_util', 'queue_length', 'latency'],
            'traffic_spike': ['throughput', 'latency', 'error_rate']
        }

        # Prior belief P(cause) for each root cause
        self.prior_probabilities = {
            'database_latency': 0.3,
            'network_issues': 0.2,
            'memory_leak': 0.15,
            'cpu_saturation': 0.2,
            'traffic_spike': 0.15
        }

        logger.info("Initialized CausalInferenceEngine")

    def infer_root_cause(self, symptoms: Dict[str, float]) -> List[Tuple[str, float]]:
        """
        Rank candidate root causes for the observed symptoms.

        Args:
            symptoms: Observed symptoms and their values,
                      e.g. {'api_latency': 500, 'error_rate': 0.15, 'cpu_util': 0.9}

        Returns:
            List of (root_cause, probability) tuples, highest probability first.
        """
        # Unnormalized posterior per cause:
        # P(cause|symptoms) ∝ P(symptoms|cause) * P(cause)
        scores = {
            cause: self._calculate_likelihood(symptoms, effects) * self.prior_probabilities[cause]
            for cause, effects in self.causal_graph.items()
        }

        # Normalize to a probability distribution; if everything scored zero,
        # spread the mass uniformly instead.
        total = sum(scores.values())
        if total > 0:
            posterior_probs = {cause: score / total for cause, score in scores.items()}
        else:
            uniform = 1.0 / len(scores)
            posterior_probs = {cause: uniform for cause in scores}

        ranked_causes = sorted(
            posterior_probs.items(),
            key=lambda item: item[1],
            reverse=True
        )

        logger.info(f"Inferred root causes: {ranked_causes[:3]}")

        return ranked_causes

    def _calculate_likelihood(self, symptoms: Dict[str, float], effects: List[str]) -> float:
        """
        Likelihood P(symptoms|cause): fraction of the cause's expected
        effects that actually appear among the observed symptoms.

        Args:
            symptoms: Observed symptoms
            effects: Effects the cause is expected to produce

        Returns:
            Likelihood score in (0, 1]; floor of 0.1 when nothing matches.
        """
        observed = [effect for effect in effects if effect in symptoms]

        # Low but non-zero probability when no expected effect is seen
        if not observed:
            return 0.1

        return len(observed) / len(effects)
378
-
379
- # === Adaptive Threshold Learner ===
380
-
381
class AdaptiveThresholdLearner:
    """
    Online learning system that adapts thresholds based on historical patterns.

    Keeps a bounded sliding window of recent values per metric and derives a
    threshold (mean + 2*std) scaled by a time-of-day / day-of-week multiplier.
    """

    def __init__(self, window_size: int = 100):
        self.window_size = window_size
        self.historical_data: Dict[str, List[Dict]] = {}
        self.thresholds: Dict[str, Dict] = {}
        self.seasonality_patterns: Dict[str, Dict] = {}

        logger.info(f"Initialized AdaptiveThresholdLearner with window_size={window_size}")

    def update(self, metric: str, value: float, timestamp: datetime.datetime) -> None:
        """
        Record a new measurement and refresh the metric's threshold.

        Args:
            metric: Metric name (e.g., 'latency', 'error_rate')
            value: Metric value
            timestamp: Timestamp of the measurement
        """
        window = self.historical_data.setdefault(metric, [])
        window.append({'value': value, 'timestamp': timestamp})

        # Evict the oldest entry once the sliding window overflows
        if len(window) > self.window_size:
            window.pop(0)

        self._update_threshold(metric)

    def _update_threshold(self, metric: str) -> None:
        """
        Recompute the adaptive threshold for a metric from its window.

        Needs at least 10 samples; otherwise leaves any existing threshold
        untouched.

        Args:
            metric: Metric name
        """
        window = self.historical_data[metric]
        if len(window) < 10:
            return

        try:
            values = [entry['value'] for entry in window]

            # Window statistics
            mean = np.mean(values)
            std = np.std(values)
            p90 = np.percentile(values, 90)
            p95 = np.percentile(values, 95)

            # Seasonality: scale by when the latest sample was observed
            latest = window[-1]['timestamp']
            time_multiplier = self._get_time_multiplier(latest.hour, latest.weekday())

            # Adaptive threshold: mean + 2*std, adjusted for time of day
            threshold = (mean + 2 * std) * time_multiplier

            self.thresholds[metric] = {
                'value': threshold,
                'mean': mean,
                'std': std,
                'p90': p90,
                'p95': p95,
                'last_updated': datetime.datetime.now(),
                'time_multiplier': time_multiplier
            }

            logger.debug(f"Updated threshold for {metric}: {threshold:.2f}")

        except Exception as e:
            logger.error(f"Failed to update threshold for {metric}: {e}")

    def _get_time_multiplier(self, hour: int, day_of_week: int) -> float:
        """
        Threshold multiplier for the given time of day and weekday.

        Args:
            hour: Hour of day (0-23)
            day_of_week: Day of week (0=Monday, 6=Sunday)

        Returns:
            1.2 during weekday business hours (9 AM - 5 PM), else 0.8
            (more sensitive off-hours/weekends).
        """
        is_weekday = day_of_week < 5
        is_business_hours = 9 <= hour <= 17
        return 1.2 if (is_weekday and is_business_hours) else 0.8

    def get_threshold(self, metric: str) -> Optional[float]:
        """
        Current adaptive threshold for a metric.

        Args:
            metric: Metric name

        Returns:
            Threshold value, or None when no threshold has been computed yet.
        """
        entry = self.thresholds.get(metric)
        return entry['value'] if entry is not None else None

    def get_statistics(self, metric: str) -> Optional[Dict]:
        """
        Full statistics dictionary for a metric.

        Args:
            metric: Metric name

        Returns:
            Statistics dictionary, or None when unavailable.
        """
        return self.thresholds.get(metric)
507
-
508
- # === Utility Functions ===
509
-
510
def create_feature_vector(event) -> np.ndarray:
    """
    Convert a ReliabilityEvent into the feature vector used by the ML models.

    Args:
        event: ReliabilityEvent object exposing latency_p99, error_rate,
               cpu_util, memory_util and throughput attributes.

    Returns:
        numpy array of [latency, error_rate, cpu, memory, throughput];
        missing (None) cpu/memory utilization falls back to 0.5.
    """
    cpu = 0.5 if event.cpu_util is None else event.cpu_util
    memory = 0.5 if event.memory_util is None else event.memory_util

    return np.array([
        event.latency_p99,
        event.error_rate,
        cpu,
        memory,
        event.throughput
    ])