Spaces:

A-R-F
/

Agentic-Reliability-Framework-v4

Running

App Files Files Community

petter2025 committed 20 days ago

Commit

8773365

verified ·

1 Parent(s): f8082d8

Update app.py

Browse files

Files changed (1) hide show

app.py +178 -37

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import gradio as gr
 import json
 import logging
 import numpy as np
 import pandas as pd
 from datetime import datetime
@@ -10,32 +11,64 @@ import time
 import os
 import sqlite3
 import contextlib
 from scipy.stats import beta
 import plotly.graph_objects as go
 # ----------------------------------------------------------------------
-# Configuration from environment variables
 # ----------------------------------------------------------------------
 LOW_THRESHOLD = float(os.getenv("ARF_LOW_THRESHOLD", "0.2"))
 HIGH_THRESHOLD = float(os.getenv("ARF_HIGH_THRESHOLD", "0.8"))
 ALPHA_PRIOR = float(os.getenv("ARF_ALPHA_PRIOR", "1.0"))
 BETA_PRIOR = float(os.getenv("ARF_BETA_PRIOR", "1.0"))
 DB_PATH = os.getenv("ARF_DB_PATH", "/data/arf_decisions.db")
 # ----------------------------------------------------------------------
-# Logging
 # ----------------------------------------------------------------------
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-)
 logger = logging.getLogger(__name__)
 # ----------------------------------------------------------------------
-# SQLite persistence
 # ----------------------------------------------------------------------
 def init_db():
-    """Create the decisions table if it doesn't exist."""
     with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
         cursor = conn.cursor()
         cursor.execute('''
@@ -47,7 +80,9 @@ def init_db():
             )
         ''')
         conn.commit()
-    logger.info(f"Database initialized at {DB_PATH}")
 def save_decision_to_db(decision: dict, risk: float):
     """Insert a decision into the database."""
@@ -89,12 +124,21 @@ def vacuum_db():
     except Exception as e:
         logger.error(f"Vacuum failed: {e}")
 # ----------------------------------------------------------------------
 # Thread‑safe history (in‑memory + DB backup)
 # ----------------------------------------------------------------------
 decision_history = []
 risk_history = []
 history_lock = threading.Lock()
 def update_dashboard_data(decision: dict, risk: float):
     """Thread‑safe update of both in‑memory history and database."""
@@ -107,6 +151,9 @@ def update_dashboard_data(decision: dict, risk: float):
         if len(risk_history) > 100:
             risk_history.pop(0)
     save_decision_to_db(decision, risk)
 def refresh_history_from_db():
     """Load recent history from database (called at startup)."""
@@ -118,9 +165,10 @@ def refresh_history_from_db():
         for ts, dec, risk in decisions:
             decision_history.append((ts, dec, risk))
             risk_history.append((ts, risk))
 # ----------------------------------------------------------------------
-# Memory monitoring (daemon thread)
 # ----------------------------------------------------------------------
 def get_memory_usage():
     """Return current process memory usage in MB (RSS)."""
@@ -145,7 +193,7 @@ def get_memory_usage():
 def memory_monitor_loop():
     """Periodically log memory usage. Runs in a daemon thread."""
-    while True:
         try:
             mem_mb = get_memory_usage()
             if mem_mb is not None:
@@ -177,7 +225,7 @@ class BayesianRiskEngine:
         return lo, hi
 # ----------------------------------------------------------------------
-# Policy Engine (now configurable)
 # ----------------------------------------------------------------------
 class PolicyEngine:
     def __init__(self, thresholds: Dict[str, float] = None):
@@ -194,10 +242,17 @@ class PolicyEngine:
             return "escalate", f"Risk in escalation zone ({self.thresholds['low']}-{self.thresholds['high']})"
 # ----------------------------------------------------------------------
-# Infrastructure analysis (synchronous, with error handling)
 # ----------------------------------------------------------------------
 def handle_infra_with_governance(fault_type: str, context_window: int, session_state: dict):
     try:
         fault_map = {
             "none": (1, 99),
             "switch_down": (20, 80),
@@ -237,6 +292,8 @@ def handle_infra_with_governance(fault_type: str, context_window: int, session_s
                 "control_plane_decision": control_decision
             }
         }
         return output, session_state
     except Exception as e:
         logger.exception("Error in handle_infra_with_governance")
@@ -285,6 +342,7 @@ def run_hmc_mcmc(samples: int, warmup: int):
         # Input validation
         samples = max(500, min(10000, int(samples)))
         warmup = max(100, min(2000, int(warmup)))
         # Generate data: 10 observations with mean 0.5, std 0.2
         np.random.seed(42)  # for reproducibility
@@ -327,8 +385,34 @@ def run_hmc_mcmc(samples: int, warmup: int):
         return {"error": str(e)}, go.Figure(), go.Figure()
 # ----------------------------------------------------------------------
-# Dashboard plots (thread‑safe)
 # ----------------------------------------------------------------------
 def generate_risk_gauge():
     with history_lock:
         if not risk_history:
@@ -349,6 +433,7 @@ def generate_risk_gauge():
         }))
     return fig
 def generate_decision_pie():
     with history_lock:
         if not decision_history:
@@ -359,6 +444,7 @@ def generate_decision_pie():
     fig.update_layout(title="Policy Decisions")
     return fig
 def generate_action_timeline():
     with history_lock:
         if not decision_history:
@@ -394,18 +480,38 @@ def refresh_dashboard():
 # ----------------------------------------------------------------------
 oss_caps = {
     "edition": "OSS (Demo)",
-    "version": "4.0.0-bayesian",
     "license": "Apache 2.0",
     "execution": {"modes": ["advisory"], "max_incidents": 100},
     "memory": {"type": "in-memory", "faiss_index_type": "flat", "max_incident_nodes": 100},
-    "enterprise_features": ["Real-time HMC (using PyMC)", "Hyperpriors", "Decision Engine"]
 }
 # ----------------------------------------------------------------------
 # Startup
 # ----------------------------------------------------------------------
-# Ensure data directory exists
-os.makedirs(os.path.dirname(DB_PATH) if os.path.dirname(DB_PATH) else ".", exist_ok=True)
 init_db()
 refresh_history_from_db()
@@ -415,18 +521,19 @@ mem_thread.start()
 # Start periodic vacuum (once a day)
 def vacuum_scheduler():
-    while True:
         time.sleep(86400)  # 24 hours
-        vacuum_db()
 vacuum_thread = threading.Thread(target=vacuum_scheduler, daemon=True)
 vacuum_thread.start()
 # ----------------------------------------------------------------------
 # Gradio UI
 # ----------------------------------------------------------------------
-with gr.Blocks(title="ARF v4 – Bayesian Risk Scoring Demo") as demo:
     gr.Markdown(f"""
-    # 🧠 ARF v4 – Bayesian Risk Scoring for AI Reliability (Demo)
     **Mathematically rigorous risk estimation using conjugate priors and MCMC**
     This demo showcases:
     - **Bayesian conjugate prior (Beta-Binomial)** – online risk update from observed failures/successes.
@@ -503,23 +610,54 @@ with gr.Blocks(title="ARF v4 – Bayesian Risk Scoring Demo") as demo:
             ]
             gr.JSON(label="Active Policies", value=policies_json)
         with gr.TabItem("Enterprise / OSS"):
             gr.Markdown(f"""
-            ## 🚀 ARF {oss_caps['edition'].upper()} Edition
-            **Version:** {oss_caps['version']}
-            **License:** {oss_caps['license']}
-            ### OSS Capabilities (Demo)
-            - **Bayesian conjugate prior** – Beta-Binomial risk scoring
-            - **Policy thresholds** – configurable approve/escalate/deny
-            - **MCMC sampling** – Metropolis-Hastings (simulates HMC concepts)
-            - **In-memory storage** – no persistence
-            ### Enterprise Features (not included)
-            {chr(10).join('- ' + f for f in oss_caps['enterprise_features'])}
-            [📅 Book a Demo](https://calendly.com/petter2025us/30min) | [📧 Contact Sales](mailto:petter2025us@outlook.com)
             """)
     # Wire events
@@ -535,6 +673,9 @@ with gr.Blocks(title="ARF v4 – Bayesian Risk Scoring Demo") as demo:
         outputs=[hmc_summary, hmc_trace_plot, hmc_pair_plot]
     )
 if __name__ == "__main__":
     demo.queue()
     demo.launch(theme="soft", server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 import json
 import logging
+import logging.handlers
 import numpy as np
 import pandas as pd
 from datetime import datetime
 import os
 import sqlite3
 import contextlib
+import signal
+import sys
+import functools
 from scipy.stats import beta
 import plotly.graph_objects as go
+from prometheus_client import Counter, Histogram, Gauge, generate_latest, CONTENT_TYPE_LATEST
 # ----------------------------------------------------------------------
+# Configuration from environment variables with validation
 # ----------------------------------------------------------------------
 LOW_THRESHOLD = float(os.getenv("ARF_LOW_THRESHOLD", "0.2"))
 HIGH_THRESHOLD = float(os.getenv("ARF_HIGH_THRESHOLD", "0.8"))
 ALPHA_PRIOR = float(os.getenv("ARF_ALPHA_PRIOR", "1.0"))
 BETA_PRIOR = float(os.getenv("ARF_BETA_PRIOR", "1.0"))
 DB_PATH = os.getenv("ARF_DB_PATH", "/data/arf_decisions.db")
+LOG_LEVEL = os.getenv("ARF_LOG_LEVEL", "INFO").upper()
+VERSION = "4.2.0+oss-enhanced"
+# Validate thresholds
+# Warnings here go through a named logger instead of the module-level
+# logging.warning(): the module-level helper implicitly calls basicConfig()
+# when the root logger has no handler, which would attach a root stderr
+# handler and make every later record from this module's logger print twice.
+if not (0 <= LOW_THRESHOLD < HIGH_THRESHOLD <= 1):
+    logging.getLogger(__name__).warning(
+        f"Invalid thresholds: low={LOW_THRESHOLD}, high={HIGH_THRESHOLD}. Using defaults."
+    )
+    LOW_THRESHOLD = 0.2
+    HIGH_THRESHOLD = 0.8
+# Validate priors
+# Written as "not (x > 0 and y > 0)" so that NaN (for which every comparison
+# is False) is rejected as well, matching the threshold check above.
+if not (ALPHA_PRIOR > 0 and BETA_PRIOR > 0):
+    logging.getLogger(__name__).warning(
+        f"Invalid priors: alpha={ALPHA_PRIOR}, beta={BETA_PRIOR}. Using defaults."
+    )
+    ALPHA_PRIOR = 1.0
+    BETA_PRIOR = 1.0
 # ----------------------------------------------------------------------
+# Logging setup (file rotation + console)
 # ----------------------------------------------------------------------
+# Resolve the configured level; fall back to INFO when ARF_LOG_LEVEL does not
+# name an integer level constant of the logging module.
+_log_level = getattr(logging, LOG_LEVEL, None)
+if not isinstance(_log_level, int):
+    _log_level = logging.INFO
 logger = logging.getLogger(__name__)
+logger.setLevel(_log_level)
+_log_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+# Console handler (for Docker logs); attached first so the fallback warning
+# below is visible even when file logging cannot be set up.
+console_handler = logging.StreamHandler(sys.stdout)
+console_handler.setFormatter(_log_formatter)
+logger.addHandler(console_handler)
+# File handler with rotation. Treated as optional: creating the directory or
+# opening the file raises OSError when the location is not writable, and that
+# must not prevent the application from starting.
+try:
+    os.makedirs("/var/log/arf", exist_ok=True)
+    file_handler = logging.handlers.RotatingFileHandler(
+        "/var/log/arf/app.log", maxBytes=10_485_760, backupCount=5
+    )
+except OSError as exc:
+    # Keep the name defined so any later reference does not raise NameError
+    file_handler = None
+    logger.warning("File logging disabled (%s); continuing with console logging only", exc)
+else:
+    file_handler.setFormatter(_log_formatter)
+    logger.addHandler(file_handler)
 # ----------------------------------------------------------------------
+# SQLite persistence with secure permissions
 # ----------------------------------------------------------------------
 def init_db():
+    """Create the decisions table with secure file permissions."""
+    db_dir = os.path.dirname(DB_PATH)
+    if db_dir and not os.path.exists(db_dir):
+        os.makedirs(db_dir, exist_ok=True)
     with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
         cursor = conn.cursor()
         cursor.execute('''
             )
         ''')
         conn.commit()
+    # Restrict permissions (owner read/write only)
+    os.chmod(DB_PATH, 0o600)
+    logger.info(f"Database initialized at {DB_PATH} with secure permissions")
 def save_decision_to_db(decision: dict, risk: float):
     """Insert a decision into the database."""
     except Exception as e:
         logger.error(f"Vacuum failed: {e}")
+# ----------------------------------------------------------------------
+# Prometheus metrics
+# ----------------------------------------------------------------------
+decisions_total = Counter('arf_decisions_total', 'Total decisions made', ['action'])
+risk_gauge = Gauge('arf_current_risk', 'Current risk score')
+decision_latency = Histogram('arf_decision_latency_seconds', 'Time to evaluate intent')
+mcmc_runs = Counter('arf_mcmc_runs_total', 'Total MCMC runs')
 # ----------------------------------------------------------------------
 # Thread‑safe history (in‑memory + DB backup)
 # ----------------------------------------------------------------------
 decision_history = []
 risk_history = []
 history_lock = threading.Lock()
+shutdown_event = threading.Event()
 def update_dashboard_data(decision: dict, risk: float):
     """Thread‑safe update of both in‑memory history and database."""
         if len(risk_history) > 100:
             risk_history.pop(0)
     save_decision_to_db(decision, risk)
+    # Update Prometheus metrics
+    decisions_total.labels(action=decision.get("risk_level", "unknown")).inc()
+    risk_gauge.set(risk)
 def refresh_history_from_db():
     """Load recent history from database (called at startup)."""
         for ts, dec, risk in decisions:
             decision_history.append((ts, dec, risk))
             risk_history.append((ts, risk))
+            risk_gauge.set(risk)  # update gauge with latest risk
 # ----------------------------------------------------------------------
+# Memory monitoring (daemon thread with graceful stop)
 # ----------------------------------------------------------------------
 def get_memory_usage():
     """Return current process memory usage in MB (RSS)."""
 def memory_monitor_loop():
     """Periodically log memory usage. Runs in a daemon thread."""
+    while not shutdown_event.is_set():
         try:
             mem_mb = get_memory_usage()
             if mem_mb is not None:
         return lo, hi
 # ----------------------------------------------------------------------
+# Policy Engine
 # ----------------------------------------------------------------------
 class PolicyEngine:
     def __init__(self, thresholds: Dict[str, float] = None):
             return "escalate", f"Risk in escalation zone ({self.thresholds['low']}-{self.thresholds['high']})"
 # ----------------------------------------------------------------------
+# Infrastructure analysis (synchronous, with validation)
 # ----------------------------------------------------------------------
 def handle_infra_with_governance(fault_type: str, context_window: int, session_state: dict):
+    start_time = time.time()
     try:
+        # Input validation
+        fault_type = fault_type.strip()
+        if fault_type not in ["none", "switch_down", "server_overload", "cascade"]:
+            fault_type = "none"
+        context_window = max(0, min(1000, int(context_window)))  # clamp
         fault_map = {
             "none": (1, 99),
             "switch_down": (20, 80),
                 "control_plane_decision": control_decision
             }
         }
+        # Record latency metric
+        decision_latency.observe(time.time() - start_time)
         return output, session_state
     except Exception as e:
         logger.exception("Error in handle_infra_with_governance")
         # Input validation
         samples = max(500, min(10000, int(samples)))
         warmup = max(100, min(2000, int(warmup)))
+        mcmc_runs.inc()  # record metric
         # Generate data: 10 observations with mean 0.5, std 0.2
         np.random.seed(42)  # for reproducibility
         return {"error": str(e)}, go.Figure(), go.Figure()
 # ----------------------------------------------------------------------
+# Dashboard plots (thread‑safe with caching)
 # ----------------------------------------------------------------------
+# Simple TTL cache decorator
+class TTLCache:
+    """Decorator object that memoizes results for ``ttl_seconds`` seconds.
+    One instance may wrap several functions; entries are keyed by the wrapped
+    function's name plus its call arguments, so all arguments must be hashable.
+    """
+    def __init__(self, ttl_seconds=5):
+        self.ttl = ttl_seconds
+        self.cache = {}  # key -> (result, monotonic time at which the call started)
+        self.lock = threading.Lock()  # guards self.cache only
+    def __call__(self, func):
+        """Return a wrapper around *func* that serves a cached result while it is fresh."""
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            key = (func.__name__, args, frozenset(kwargs.items()))
+            # Monotonic clock: a wall-clock adjustment must not be able to
+            # extend or cut short the lifetime of an entry.
+            now = time.monotonic()
+            with self.lock:
+                entry = self.cache.get(key)
+                if entry is not None and now - entry[1] < self.ttl:
+                    return entry[0]
+            # Compute outside the lock so a slow call does not block other callers
+            result = func(*args, **kwargs)
+            with self.lock:
+                self.cache[key] = (result, now)
+            return result
+        return wrapper
+dashboard_cache = TTLCache(ttl_seconds=2)  # cache for 2 seconds
+@dashboard_cache
 def generate_risk_gauge():
     with history_lock:
         if not risk_history:
         }))
     return fig
+@dashboard_cache
 def generate_decision_pie():
     with history_lock:
         if not decision_history:
     fig.update_layout(title="Policy Decisions")
     return fig
+@dashboard_cache
 def generate_action_timeline():
     with history_lock:
         if not decision_history:
 # ----------------------------------------------------------------------
 oss_caps = {
     "edition": "OSS (Demo)",
+    "version": VERSION,
     "license": "Apache 2.0",
     "execution": {"modes": ["advisory"], "max_incidents": 100},
     "memory": {"type": "in-memory", "faiss_index_type": "flat", "max_incident_nodes": 100},
+    "enterprise_features": [
+        "Real-time HMC (using PyMC)",
+        "Hyperpriors",
+        "Decision Engine",
+        "Full audit trails & compliance reporting",
+        "Blast radius limits & automatic rollback",
+        "Multi-cloud & hybrid deployment support"
+    ]
 }
+# ----------------------------------------------------------------------
+# Graceful shutdown
+# ----------------------------------------------------------------------
+def shutdown_handler(signum, frame):
+    """Signal handler: set the shutdown flag, pause briefly, then exit with status 0."""
+    logger.info("Received shutdown signal, cleaning up...")
+    shutdown_event.set()
+    # Short grace period so loops that poll shutdown_event can observe the flag
+    grace_seconds = 2
+    time.sleep(grace_seconds)
+    logger.info("Shutdown complete")
+    raise SystemExit(0)
+signal.signal(signal.SIGTERM, shutdown_handler)
+signal.signal(signal.SIGINT, shutdown_handler)
 # ----------------------------------------------------------------------
 # Startup
 # ----------------------------------------------------------------------
+# Ensure data directory exists and DB has secure permissions
 init_db()
 refresh_history_from_db()
 # Start periodic vacuum (once a day)
 def vacuum_scheduler():
+    """Run vacuum_db() once every 24 hours until shutdown_event is set."""
+    # Event.wait() returns False on timeout and True as soon as the event is
+    # set, so the loop vacuums after each full day and exits promptly on
+    # shutdown instead of sitting in an uninterruptible 24-hour sleep.
+    while not shutdown_event.wait(86400):  # 24 hours
+        vacuum_db()
 vacuum_thread = threading.Thread(target=vacuum_scheduler, daemon=True)
 vacuum_thread.start()
 # ----------------------------------------------------------------------
 # Gradio UI
 # ----------------------------------------------------------------------
+with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo") as demo:
     gr.Markdown(f"""
+    # 🧠 ARF v{VERSION} – Bayesian Risk Scoring for AI Reliability (Demo)
     **Mathematically rigorous risk estimation using conjugate priors and MCMC**
     This demo showcases:
     - **Bayesian conjugate prior (Beta-Binomial)** – online risk update from observed failures/successes.
             ]
             gr.JSON(label="Active Policies", value=policies_json)
+        # Sales-driven Enterprise / OSS tab
         with gr.TabItem("Enterprise / OSS"):
             gr.Markdown(f"""
+            <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 2rem; border-radius: 12px; margin-bottom: 2rem; text-align: center; color: white;">
+                <h1 style="margin: 0; font-size: 2.5rem;">🚀 ARF {oss_caps['edition'].upper()} Edition</h1>
+                <p style="font-size: 1.2rem; opacity: 0.9;">Version {oss_caps['version']} · Apache 2.0 License</p>
+            </div>
+            <div style="display: flex; gap: 1.5rem; margin-bottom: 2rem;">
+                <div style="flex: 1; background: #f8f9fa; padding: 1.5rem; border-radius: 8px;">
+                    <h3>📦 OSS Capabilities (Demo)</h3>
+                    <ul>
+                        <li>✅ Bayesian conjugate prior – Beta-Binomial risk scoring</li>
+                        <li>✅ Policy thresholds – configurable approve/escalate/deny</li>
+                        <li>✅ MCMC sampling – Metropolis-Hastings (simulates HMC concepts)</li>
+                        <li>✅ In-memory storage – no persistence</li>
+                        <li>✅ Full open-source transparency</li>
+                    </ul>
+                </div>
+                <div style="flex: 1; background: #f8f9fa; padding: 1.5rem; border-radius: 8px;">
+                    <h3>🏢 Enterprise Features</h3>
+                    <ul>
+                        <li>🔒 Real-time HMC (using PyMC) – Bayesian deep learning for risk</li>
+                        <li>🔒 Hyperpriors – hierarchical models for better generalization</li>
+                        <li>🔒 Decision Engine with full audit trails</li>
+                        <li>🔒 Blast radius limits & automatic rollback</li>
+                        <li>🔒 Multi-cloud & hybrid deployment support</li>
+                        <li>🔒 Compliance reporting (SOC2, ISO 27001)</li>
+                        <li>🔒 24/7 enterprise support & SLAs</li>
+                    </ul>
+                </div>
+            </div>
+            <div style="background: #e9ecef; padding: 1.5rem; border-radius: 8px; text-align: center;">
+                <h3 style="margin-top: 0;">✨ Why Upgrade to Enterprise?</h3>
+                <p>ARF Enterprise delivers the same mathematically rigorous foundation but with <strong>production‑grade reliability</strong> and <strong>governance controls</strong> that meet the strictest compliance requirements.</p>
+                <ul style="display: inline-block; text-align: left; margin: 1rem auto;">
+                    <li>📊 **Persistent storage** – every decision logged and queryable</li>
+                    <li>⚙️ **Advanced risk fusion** – combine conjugate, hyperprior, and HMC estimates</li>
+                    <li>🛡️ **Semantic memory** – FAISS vector search for context‑aware policies</li>
+                    <li>📈 **Real‑time dashboards** with Grafana & Prometheus integration</li>
+                </ul>
+            </div>
+            <div style="text-align: center; margin-top: 2rem;">
+                <a href="https://calendly.com/petter2025us/30min" target="_blank" style="background: #764ba2; color: white; padding: 12px 24px; text-decoration: none; border-radius: 8px; font-weight: bold; margin-right: 1rem;">📅 Book a Demo</a>
+                <a href="mailto:petter2025us@outlook.com" style="background: #667eea; color: white; padding: 12px 24px; text-decoration: none; border-radius: 8px; font-weight: bold;">📧 Contact Sales</a>
+            </div>
             """)
     # Wire events
         outputs=[hmc_summary, hmc_trace_plot, hmc_pair_plot]
     )
+# Add Prometheus metrics endpoint
+# NOTE(review): the handler returns a Flask-style (body, status, headers)
+# tuple. FastAPI does not appear to unpack such tuples, so this is likely
+# JSON-encoded rather than served as Prometheus text -- consider returning a
+# Response built with media_type=CONTENT_TYPE_LATEST. TODO confirm.
+# NOTE(review): verify that `demo.fastapi_app` exists before launch() on the
+# Gradio version this Space pins.
+demo.fastapi_app.add_api_route("/metrics", lambda: (generate_latest(), 200, {"Content-Type": CONTENT_TYPE_LATEST}), methods=["GET"])
 if __name__ == "__main__":
     demo.queue()
     demo.launch(theme="soft", server_name="0.0.0.0", server_port=7860)