Update app.py
Browse files
app.py
CHANGED
|
@@ -15,6 +15,7 @@ import sys
|
|
| 15 |
import functools
|
| 16 |
import csv
|
| 17 |
import io
|
|
|
|
| 18 |
from collections import deque
|
| 19 |
from scipy.stats import beta
|
| 20 |
import plotly.graph_objects as go
|
|
@@ -59,25 +60,22 @@ os.makedirs("/var/log/arf", exist_ok=True)
|
|
| 59 |
logger = logging.getLogger(__name__)
|
| 60 |
logger.setLevel(getattr(logging, LOG_LEVEL, logging.INFO))
|
| 61 |
|
| 62 |
-
# File handler with rotation
|
| 63 |
file_handler = logging.handlers.RotatingFileHandler(
|
| 64 |
"/var/log/arf/app.log", maxBytes=10_485_760, backupCount=5
|
| 65 |
)
|
| 66 |
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
|
| 67 |
|
| 68 |
-
# Console handler (for Docker logs)
|
| 69 |
console_handler = logging.StreamHandler(sys.stdout)
|
| 70 |
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
|
| 71 |
|
| 72 |
logger.addHandler(file_handler)
|
| 73 |
logger.addHandler(console_handler)
|
| 74 |
-
logger.propagate = False
|
| 75 |
|
| 76 |
# ----------------------------------------------------------------------
|
| 77 |
# SQLite persistence with secure permissions
|
| 78 |
# ----------------------------------------------------------------------
|
| 79 |
def init_db():
|
| 80 |
-
"""Create the decisions table with secure file permissions."""
|
| 81 |
db_dir = os.path.dirname(DB_PATH)
|
| 82 |
if db_dir and not os.path.exists(db_dir):
|
| 83 |
os.makedirs(db_dir, exist_ok=True)
|
|
@@ -92,7 +90,6 @@ def init_db():
|
|
| 92 |
)
|
| 93 |
''')
|
| 94 |
conn.commit()
|
| 95 |
-
# Restrict permissions (owner read/write only) – best effort
|
| 96 |
try:
|
| 97 |
os.chmod(DB_PATH, 0o600)
|
| 98 |
except Exception as e:
|
|
@@ -100,7 +97,6 @@ def init_db():
|
|
| 100 |
logger.info(f"Database initialized at {DB_PATH}")
|
| 101 |
|
| 102 |
def save_decision_to_db(decision: dict, risk: float):
|
| 103 |
-
"""Insert a decision into the database."""
|
| 104 |
try:
|
| 105 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
| 106 |
cursor = conn.cursor()
|
|
@@ -113,7 +109,6 @@ def save_decision_to_db(decision: dict, risk: float):
|
|
| 113 |
logger.error(f"Failed to save decision to DB: {e}")
|
| 114 |
|
| 115 |
def load_recent_decisions(limit: int = 100) -> List[Tuple[str, dict, float]]:
|
| 116 |
-
"""Load the most recent decisions from the database."""
|
| 117 |
decisions = []
|
| 118 |
try:
|
| 119 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
|
@@ -125,13 +120,12 @@ def load_recent_decisions(limit: int = 100) -> List[Tuple[str, dict, float]]:
|
|
| 125 |
rows = cursor.fetchall()
|
| 126 |
for ts, json_str, risk in rows:
|
| 127 |
decisions.append((ts, json.loads(json_str), risk))
|
| 128 |
-
decisions.reverse()
|
| 129 |
except Exception as e:
|
| 130 |
logger.error(f"Failed to load decisions from DB: {e}")
|
| 131 |
return decisions
|
| 132 |
|
| 133 |
def vacuum_db():
|
| 134 |
-
"""Run VACUUM on the database (periodic maintenance)."""
|
| 135 |
try:
|
| 136 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
| 137 |
conn.execute("VACUUM")
|
|
@@ -154,7 +148,7 @@ else:
|
|
| 154 |
prom_mcmc_runs = None
|
| 155 |
|
| 156 |
# ----------------------------------------------------------------------
|
| 157 |
-
# Thread‑safe history
|
| 158 |
# ----------------------------------------------------------------------
|
| 159 |
decision_history = []
|
| 160 |
risk_history = []
|
|
@@ -162,23 +156,19 @@ history_lock = threading.Lock()
|
|
| 162 |
shutdown_event = threading.Event()
|
| 163 |
|
| 164 |
def update_dashboard_data(decision: dict, risk: float):
|
| 165 |
-
"""Thread‑safe update of both in‑memory history and database."""
|
| 166 |
with history_lock:
|
| 167 |
decision_history.append((datetime.utcnow().isoformat(), decision, risk))
|
| 168 |
risk_history.append((datetime.utcnow().isoformat(), risk))
|
| 169 |
-
# Keep only last 100 in memory
|
| 170 |
if len(decision_history) > 100:
|
| 171 |
decision_history.pop(0)
|
| 172 |
if len(risk_history) > 100:
|
| 173 |
risk_history.pop(0)
|
| 174 |
save_decision_to_db(decision, risk)
|
| 175 |
-
# Update Prometheus metrics
|
| 176 |
if PROMETHEUS_AVAILABLE:
|
| 177 |
prom_decisions_total.labels(action=decision.get("risk_level", "unknown")).inc()
|
| 178 |
prom_risk_gauge.set(risk)
|
| 179 |
|
| 180 |
def refresh_history_from_db():
|
| 181 |
-
"""Load recent history from database (called at startup)."""
|
| 182 |
global decision_history, risk_history
|
| 183 |
decisions = load_recent_decisions(100)
|
| 184 |
with history_lock:
|
|
@@ -187,15 +177,13 @@ def refresh_history_from_db():
|
|
| 187 |
for ts, dec, risk in decisions:
|
| 188 |
decision_history.append((ts, dec, risk))
|
| 189 |
risk_history.append((ts, risk))
|
| 190 |
-
# After loading, set the Prometheus gauge to the latest risk
|
| 191 |
if PROMETHEUS_AVAILABLE and risk_history:
|
| 192 |
prom_risk_gauge.set(risk_history[-1][1])
|
| 193 |
|
| 194 |
# ----------------------------------------------------------------------
|
| 195 |
-
# Memory monitoring
|
| 196 |
# ----------------------------------------------------------------------
|
| 197 |
def get_memory_usage():
|
| 198 |
-
"""Return current process memory usage in MB (RSS)."""
|
| 199 |
try:
|
| 200 |
import resource
|
| 201 |
rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
@@ -216,7 +204,6 @@ def get_memory_usage():
|
|
| 216 |
return None
|
| 217 |
|
| 218 |
def memory_monitor_loop():
|
| 219 |
-
"""Periodically log memory usage. Runs in a daemon thread."""
|
| 220 |
while not shutdown_event.is_set():
|
| 221 |
try:
|
| 222 |
mem_mb = get_memory_usage()
|
|
@@ -226,40 +213,30 @@ def memory_monitor_loop():
|
|
| 226 |
logger.info("Process memory: unknown")
|
| 227 |
except Exception as e:
|
| 228 |
logger.error(f"Memory logging error: {e}")
|
| 229 |
-
# Sleep in small intervals to react quickly to shutdown
|
| 230 |
for _ in range(60):
|
| 231 |
if shutdown_event.is_set():
|
| 232 |
break
|
| 233 |
time.sleep(1)
|
| 234 |
|
| 235 |
# ----------------------------------------------------------------------
|
| 236 |
-
# Bayesian Risk Engine
|
| 237 |
# ----------------------------------------------------------------------
|
| 238 |
class BayesianRiskEngine:
|
| 239 |
def __init__(self, alpha=ALPHA_PRIOR, beta=BETA_PRIOR, maxlen=None):
    """Beta-Binomial risk model over an optional sliding window of events.

    Args:
        alpha: prior pseudo-count of failures (defaults to module prior).
        beta: prior pseudo-count of successes (defaults to module prior).
        maxlen: size of the event window; None keeps every event.
    """
    self.alpha = alpha
    self.beta = beta
    self.maxlen = maxlen
    # Bounded deque: once full, the oldest event is discarded automatically
    # on every append, which is what makes the window "sliding".
    self.events = deque(maxlen=maxlen)
    # Running evidence totals over the events currently in the window.
    self.total_failures = 0
    self.total_successes = 0
|
| 246 |
|
| 247 |
def update(self, failures, successes):
|
| 248 |
-
# Add new event
|
| 249 |
self.events.append((failures, successes))
|
| 250 |
self.total_failures += failures
|
| 251 |
self.total_successes += successes
|
| 252 |
-
|
| 253 |
-
# If maxlen is reached and the queue overflows, we've already removed the oldest,
|
| 254 |
-
# but we need to subtract it from totals.
|
| 255 |
if self.maxlen is not None and len(self.events) == self.maxlen:
|
| 256 |
-
# The deque automatically discards the leftmost when full, but we have to
|
| 257 |
-
# manually adjust totals to reflect the discarded event.
|
| 258 |
-
# However, we can't easily know what was discarded. Instead, recompute from deque.
|
| 259 |
self.total_failures = sum(f for f, _ in self.events)
|
| 260 |
self.total_successes = sum(s for _, s in self.events)
|
| 261 |
-
|
| 262 |
-
# Set alpha,beta = prior + totals
|
| 263 |
self.alpha = ALPHA_PRIOR + self.total_failures
|
| 264 |
self.beta = BETA_PRIOR + self.total_successes
|
| 265 |
|
|
@@ -272,7 +249,7 @@ class BayesianRiskEngine:
|
|
| 272 |
return lo, hi
|
| 273 |
|
| 274 |
# ----------------------------------------------------------------------
|
| 275 |
-
# Policy Engine
|
| 276 |
# ----------------------------------------------------------------------
|
| 277 |
class PolicyEngine:
|
| 278 |
def __init__(self):
|
|
@@ -287,16 +264,15 @@ class PolicyEngine:
|
|
| 287 |
return "escalate", f"Risk in escalation zone ({self.thresholds['low']}-{self.thresholds['high']})"
|
| 288 |
|
| 289 |
# ----------------------------------------------------------------------
|
| 290 |
-
# Infrastructure analysis
|
| 291 |
# ----------------------------------------------------------------------
|
| 292 |
def handle_infra_with_governance(fault_type: str, context_window: int, session_state: dict):
|
| 293 |
start_time = time.time()
|
| 294 |
try:
|
| 295 |
-
# Input validation
|
| 296 |
fault_type = fault_type.strip()
|
| 297 |
if fault_type not in ["none", "switch_down", "server_overload", "cascade"]:
|
| 298 |
fault_type = "none"
|
| 299 |
-
context_window = max(0, min(1000, int(context_window)))
|
| 300 |
|
| 301 |
fault_map = {
|
| 302 |
"none": (1, 99),
|
|
@@ -306,7 +282,6 @@ def handle_infra_with_governance(fault_type: str, context_window: int, session_s
|
|
| 306 |
}
|
| 307 |
failures, successes = fault_map.get(fault_type, (1, 99))
|
| 308 |
|
| 309 |
-
# Use context_window: if >0, limit to last N events; else unlimited
|
| 310 |
maxlen = context_window if context_window > 0 else None
|
| 311 |
risk_engine = BayesianRiskEngine(maxlen=maxlen)
|
| 312 |
risk_engine.update(failures, successes)
|
|
@@ -340,7 +315,6 @@ def handle_infra_with_governance(fault_type: str, context_window: int, session_s
|
|
| 340 |
"control_plane_decision": control_decision
|
| 341 |
}
|
| 342 |
}
|
| 343 |
-
# Record latency metric
|
| 344 |
if PROMETHEUS_AVAILABLE:
|
| 345 |
prom_decision_latency.observe(time.time() - start_time)
|
| 346 |
return output, session_state
|
|
@@ -350,7 +324,6 @@ def handle_infra_with_governance(fault_type: str, context_window: int, session_s
|
|
| 350 |
|
| 351 |
def autonomous_control_decision(risk, risk_engine, policy_engine):
|
| 352 |
action, reason = policy_engine.evaluate(risk)
|
| 353 |
-
# Use configurable thresholds for risk level
|
| 354 |
risk_level = "low" if risk < LOW_THRESHOLD else "medium" if risk < HIGH_THRESHOLD else "high"
|
| 355 |
decision = {
|
| 356 |
"timestamp": datetime.utcnow().isoformat(),
|
|
@@ -363,7 +336,7 @@ def autonomous_control_decision(risk, risk_engine, policy_engine):
|
|
| 363 |
return decision
|
| 364 |
|
| 365 |
# ----------------------------------------------------------------------
|
| 366 |
-
# MCMC
|
| 367 |
# ----------------------------------------------------------------------
|
| 368 |
class MHMCMC:
|
| 369 |
def __init__(self, log_target, proposal_sd=0.1):
|
|
@@ -390,18 +363,16 @@ class MHMCMC:
|
|
| 390 |
|
| 391 |
def run_hmc_mcmc(samples: int, warmup: int):
|
| 392 |
try:
|
| 393 |
-
# Input validation
|
| 394 |
samples = max(500, min(10000, int(samples)))
|
| 395 |
warmup = max(100, min(2000, int(warmup)))
|
| 396 |
if PROMETHEUS_AVAILABLE:
|
| 397 |
-
prom_mcmc_runs.inc()
|
| 398 |
|
| 399 |
-
|
| 400 |
-
np.random.seed(42) # for reproducibility
|
| 401 |
data = np.random.normal(0.5, 0.2, 10)
|
| 402 |
|
| 403 |
def log_prior(mu):
|
| 404 |
-
return -0.5 * (mu ** 2)
|
| 405 |
|
| 406 |
def log_likelihood(mu):
|
| 407 |
return -0.5 * np.sum(((data - mu) / 0.2) ** 2)
|
|
@@ -437,7 +408,7 @@ def run_hmc_mcmc(samples: int, warmup: int):
|
|
| 437 |
return {"error": str(e)}, go.Figure(), go.Figure()
|
| 438 |
|
| 439 |
# ----------------------------------------------------------------------
|
| 440 |
-
# Dashboard plots
|
| 441 |
# ----------------------------------------------------------------------
|
| 442 |
class TTLCache:
|
| 443 |
def __init__(self, ttl_seconds=5):
|
|
@@ -461,7 +432,7 @@ class TTLCache:
|
|
| 461 |
return result
|
| 462 |
return wrapper
|
| 463 |
|
| 464 |
-
dashboard_cache = TTLCache(ttl_seconds=2)
|
| 465 |
|
| 466 |
@dashboard_cache
|
| 467 |
def generate_risk_gauge():
|
|
@@ -509,7 +480,6 @@ def generate_action_timeline():
|
|
| 509 |
|
| 510 |
@dashboard_cache
|
| 511 |
def generate_risk_trend():
|
| 512 |
-
"""Line chart showing risk over time."""
|
| 513 |
with history_lock:
|
| 514 |
if not risk_history:
|
| 515 |
return go.Figure()
|
|
@@ -517,7 +487,6 @@ def generate_risk_trend():
|
|
| 517 |
risks = [r for _, r in risk_history]
|
| 518 |
fig = go.Figure()
|
| 519 |
fig.add_trace(go.Scatter(x=times, y=risks, mode='lines+markers', name='Risk', line=dict(color='red', width=2)))
|
| 520 |
-
# Add horizontal lines for thresholds
|
| 521 |
fig.add_hline(y=LOW_THRESHOLD, line_dash="dash", line_color="green", annotation_text=f"Low ({LOW_THRESHOLD})")
|
| 522 |
fig.add_hline(y=HIGH_THRESHOLD, line_dash="dash", line_color="orange", annotation_text=f"High ({HIGH_THRESHOLD})")
|
| 523 |
fig.update_layout(title="Risk Trend", xaxis_title="Time", yaxis_title="Risk Score", yaxis_range=[0, 1])
|
|
@@ -547,45 +516,40 @@ def refresh_dashboard():
|
|
| 547 |
# Batch simulation
|
| 548 |
# ----------------------------------------------------------------------
|
| 549 |
def run_batch_simulation(context_window: int):
|
| 550 |
-
"""Run evaluation for all fault types and return a summary table and individual results."""
|
| 551 |
fault_types = ["none", "switch_down", "server_overload", "cascade"]
|
| 552 |
results = []
|
| 553 |
for fault in fault_types:
|
| 554 |
-
# We'll call handle_infra_with_governance directly
|
| 555 |
-
# Since it returns (output, state) and we ignore state
|
| 556 |
output, _ = handle_infra_with_governance(fault, context_window, {})
|
| 557 |
if "error" in output:
|
| 558 |
-
results.append(
|
| 559 |
-
|
| 560 |
-
"
|
| 561 |
-
|
| 562 |
-
"
|
| 563 |
-
"
|
| 564 |
-
|
| 565 |
else:
|
| 566 |
-
results.append(
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
#
|
| 574 |
-
|
| 575 |
-
return summary_table
|
| 576 |
|
| 577 |
# ----------------------------------------------------------------------
|
| 578 |
# Data export
|
| 579 |
# ----------------------------------------------------------------------
|
| 580 |
def export_history_to_csv():
|
| 581 |
-
"""Generate CSV of all decisions from database."""
|
| 582 |
try:
|
| 583 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
| 584 |
cursor = conn.cursor()
|
| 585 |
cursor.execute("SELECT timestamp, decision_json, risk FROM decisions ORDER BY timestamp")
|
| 586 |
rows = cursor.fetchall()
|
| 587 |
if not rows:
|
| 588 |
-
return
|
| 589 |
output = io.StringIO()
|
| 590 |
writer = csv.writer(output)
|
| 591 |
writer.writerow(["Timestamp", "Decision", "Risk", "Approved", "Risk Level", "Reason"])
|
|
@@ -600,27 +564,29 @@ def export_history_to_csv():
|
|
| 600 |
dec.get("reason", "")
|
| 601 |
])
|
| 602 |
output.seek(0)
|
| 603 |
-
|
|
|
|
|
|
|
|
|
|
| 604 |
except Exception as e:
|
| 605 |
logger.error(f"Export failed: {e}")
|
| 606 |
-
return
|
| 607 |
|
| 608 |
# ----------------------------------------------------------------------
|
| 609 |
-
# Update thresholds
|
| 610 |
# ----------------------------------------------------------------------
|
| 611 |
def update_thresholds(low: float, high: float):
|
| 612 |
global LOW_THRESHOLD, HIGH_THRESHOLD
|
| 613 |
if 0 <= low < high <= 1:
|
| 614 |
LOW_THRESHOLD = low
|
| 615 |
HIGH_THRESHOLD = high
|
| 616 |
-
# Also update PolicyEngine thresholds (but PolicyEngine reads from globals, so fine)
|
| 617 |
logger.info(f"Updated thresholds: low={low}, high={high}")
|
| 618 |
return f"Thresholds updated: approve < {low}, escalate {low}-{high}, deny > {high}"
|
| 619 |
else:
|
| 620 |
return f"Invalid thresholds: low={low}, high={high}. Must satisfy 0 ≤ low < high ≤ 1."
|
| 621 |
|
| 622 |
# ----------------------------------------------------------------------
|
| 623 |
-
# OSS capabilities
|
| 624 |
# ----------------------------------------------------------------------
|
| 625 |
oss_caps = {
|
| 626 |
"edition": "OSS (Demo)",
|
|
@@ -683,10 +649,8 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
|
|
| 683 |
All components are implemented with only `numpy`, `scipy`, and standard libraries.
|
| 684 |
""")
|
| 685 |
|
| 686 |
-
# ------------------------------------------------------------------
|
| 687 |
-
# Control Plane Dashboard with auto-refresh
|
| 688 |
-
# ------------------------------------------------------------------
|
| 689 |
with gr.Tabs():
|
|
|
|
| 690 |
with gr.TabItem("Control Plane Dashboard"):
|
| 691 |
gr.Markdown("### 🎮 Control Plane")
|
| 692 |
with gr.Row():
|
|
@@ -715,24 +679,20 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
|
|
| 715 |
with gr.Row():
|
| 716 |
auto_refresh = gr.Checkbox(label="Auto-refresh (3s)", value=False)
|
| 717 |
refresh_btn = gr.Button("Refresh Now")
|
| 718 |
-
|
| 719 |
-
timer = gr.Timer(value=3, active=False) # will be toggled
|
| 720 |
def refresh_if_enabled(auto):
|
| 721 |
if auto:
|
| 722 |
return refresh_dashboard()
|
| 723 |
else:
|
| 724 |
-
return [gr.update() for _ in range(5)]
|
| 725 |
timer.tick(refresh_if_enabled, inputs=[auto_refresh], outputs=[control_stats, risk_gauge, decision_pie, action_timeline, risk_trend])
|
| 726 |
refresh_btn.click(
|
| 727 |
fn=refresh_dashboard,
|
| 728 |
outputs=[control_stats, risk_gauge, decision_pie, action_timeline, risk_trend]
|
| 729 |
)
|
| 730 |
-
# Start/stop timer based on checkbox
|
| 731 |
auto_refresh.change(lambda v: gr.Timer(active=v), inputs=[auto_refresh], outputs=[timer])
|
| 732 |
|
| 733 |
-
#
|
| 734 |
-
# Infrastructure Reliability (with batch simulation)
|
| 735 |
-
# ------------------------------------------------------------------
|
| 736 |
with gr.TabItem("Infrastructure Reliability"):
|
| 737 |
gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Bayesian Risk")
|
| 738 |
infra_state = gr.State(value={})
|
|
@@ -759,7 +719,8 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
|
|
| 759 |
infra_output = gr.JSON(label="Analysis Result")
|
| 760 |
batch_results = gr.Dataframe(
|
| 761 |
headers=["Fault Type", "Risk", "Decision", "Risk Level", "Confidence Interval"],
|
| 762 |
-
label="Batch Simulation Results"
|
|
|
|
| 763 |
)
|
| 764 |
infra_btn.click(
|
| 765 |
fn=handle_infra_with_governance,
|
|
@@ -772,9 +733,7 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
|
|
| 772 |
outputs=[batch_results]
|
| 773 |
)
|
| 774 |
|
| 775 |
-
# ------------------------------------------------------------------
|
| 776 |
# Deep Analysis (MCMC)
|
| 777 |
-
# ------------------------------------------------------------------
|
| 778 |
with gr.TabItem("Deep Analysis (MCMC)"):
|
| 779 |
gr.Markdown("### Markov Chain Monte Carlo (Metropolis‑Hastings)")
|
| 780 |
with gr.Row():
|
|
@@ -793,9 +752,7 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
|
|
| 793 |
outputs=[hmc_summary, hmc_trace_plot, hmc_pair_plot]
|
| 794 |
)
|
| 795 |
|
| 796 |
-
#
|
| 797 |
-
# Policy Management (with interactive sliders)
|
| 798 |
-
# ------------------------------------------------------------------
|
| 799 |
with gr.TabItem("Policy Management"):
|
| 800 |
gr.Markdown("### 📋 Execution Policies")
|
| 801 |
with gr.Row():
|
|
@@ -824,9 +781,7 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
|
|
| 824 |
outputs=[policy_display]
|
| 825 |
)
|
| 826 |
|
| 827 |
-
#
|
| 828 |
-
# Enterprise / OSS (with data export)
|
| 829 |
-
# ------------------------------------------------------------------
|
| 830 |
with gr.TabItem("Enterprise / OSS"):
|
| 831 |
gr.Markdown(f"""
|
| 832 |
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 2rem; border-radius: 12px; margin-bottom: 2rem; text-align: center; color: white;">
|
|
@@ -875,35 +830,13 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
|
|
| 875 |
<a href="mailto:petter2025us@outlook.com" style="background: #667eea; color: white; padding: 12px 24px; text-decoration: none; border-radius: 8px; font-weight: bold;">📧 Contact Sales</a>
|
| 876 |
</div>
|
| 877 |
""")
|
| 878 |
-
# Data export section
|
| 879 |
gr.Markdown("### 📥 Export Decision History")
|
| 880 |
-
|
| 881 |
-
export_btn = gr.DownloadButton("Download CSV", variant="primary")
|
| 882 |
-
export_btn.click(
|
| 883 |
-
fn=export_history_to_csv,
|
| 884 |
-
outputs=[gr.File(label="decision_history.csv", visible=False)] # hidden, but we need a file component
|
| 885 |
-
)
|
| 886 |
-
# Note: gradio DownloadButton works with a function that returns a file path or bytes.
|
| 887 |
-
# We'll create a temporary file.
|
| 888 |
-
def export_and_return_file():
|
| 889 |
-
csv_data = export_history_to_csv()
|
| 890 |
-
if csv_data and not csv_data.startswith("Export failed"):
|
| 891 |
-
# Write to a temporary file
|
| 892 |
-
import tempfile
|
| 893 |
-
with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
|
| 894 |
-
f.write(csv_data)
|
| 895 |
-
return f.name
|
| 896 |
-
else:
|
| 897 |
-
return None
|
| 898 |
export_btn.click(
|
| 899 |
-
fn=
|
| 900 |
outputs=[gr.File(label="decision_history.csv")]
|
| 901 |
)
|
| 902 |
|
| 903 |
-
# ------------------------------------------------------------------
|
| 904 |
-
# Wire events for infra and MCMC (already done above)
|
| 905 |
-
# ------------------------------------------------------------------
|
| 906 |
-
|
| 907 |
# ----------------------------------------------------------------------
|
| 908 |
# Launch
|
| 909 |
# ----------------------------------------------------------------------
|
|
|
|
| 15 |
import functools
|
| 16 |
import csv
|
| 17 |
import io
|
| 18 |
+
import tempfile
|
| 19 |
from collections import deque
|
| 20 |
from scipy.stats import beta
|
| 21 |
import plotly.graph_objects as go
|
|
|
|
| 60 |
logger = logging.getLogger(__name__)
|
| 61 |
logger.setLevel(getattr(logging, LOG_LEVEL, logging.INFO))
|
| 62 |
|
|
|
|
| 63 |
file_handler = logging.handlers.RotatingFileHandler(
|
| 64 |
"/var/log/arf/app.log", maxBytes=10_485_760, backupCount=5
|
| 65 |
)
|
| 66 |
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
|
| 67 |
|
|
|
|
| 68 |
console_handler = logging.StreamHandler(sys.stdout)
|
| 69 |
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
|
| 70 |
|
| 71 |
logger.addHandler(file_handler)
|
| 72 |
logger.addHandler(console_handler)
|
| 73 |
+
logger.propagate = False
|
| 74 |
|
| 75 |
# ----------------------------------------------------------------------
|
| 76 |
# SQLite persistence with secure permissions
|
| 77 |
# ----------------------------------------------------------------------
|
| 78 |
def init_db():
|
|
|
|
| 79 |
db_dir = os.path.dirname(DB_PATH)
|
| 80 |
if db_dir and not os.path.exists(db_dir):
|
| 81 |
os.makedirs(db_dir, exist_ok=True)
|
|
|
|
| 90 |
)
|
| 91 |
''')
|
| 92 |
conn.commit()
|
|
|
|
| 93 |
try:
|
| 94 |
os.chmod(DB_PATH, 0o600)
|
| 95 |
except Exception as e:
|
|
|
|
| 97 |
logger.info(f"Database initialized at {DB_PATH}")
|
| 98 |
|
| 99 |
def save_decision_to_db(decision: dict, risk: float):
|
|
|
|
| 100 |
try:
|
| 101 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
| 102 |
cursor = conn.cursor()
|
|
|
|
| 109 |
logger.error(f"Failed to save decision to DB: {e}")
|
| 110 |
|
| 111 |
def load_recent_decisions(limit: int = 100) -> List[Tuple[str, dict, float]]:
|
|
|
|
| 112 |
decisions = []
|
| 113 |
try:
|
| 114 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
|
|
|
| 120 |
rows = cursor.fetchall()
|
| 121 |
for ts, json_str, risk in rows:
|
| 122 |
decisions.append((ts, json.loads(json_str), risk))
|
| 123 |
+
decisions.reverse()
|
| 124 |
except Exception as e:
|
| 125 |
logger.error(f"Failed to load decisions from DB: {e}")
|
| 126 |
return decisions
|
| 127 |
|
| 128 |
def vacuum_db():
|
|
|
|
| 129 |
try:
|
| 130 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
| 131 |
conn.execute("VACUUM")
|
|
|
|
| 148 |
prom_mcmc_runs = None
|
| 149 |
|
| 150 |
# ----------------------------------------------------------------------
|
| 151 |
+
# Thread‑safe history
|
| 152 |
# ----------------------------------------------------------------------
|
| 153 |
decision_history = []
|
| 154 |
risk_history = []
|
|
|
|
| 156 |
shutdown_event = threading.Event()
|
| 157 |
|
| 158 |
def update_dashboard_data(decision: dict, risk: float):
|
|
|
|
| 159 |
with history_lock:
|
| 160 |
decision_history.append((datetime.utcnow().isoformat(), decision, risk))
|
| 161 |
risk_history.append((datetime.utcnow().isoformat(), risk))
|
|
|
|
| 162 |
if len(decision_history) > 100:
|
| 163 |
decision_history.pop(0)
|
| 164 |
if len(risk_history) > 100:
|
| 165 |
risk_history.pop(0)
|
| 166 |
save_decision_to_db(decision, risk)
|
|
|
|
| 167 |
if PROMETHEUS_AVAILABLE:
|
| 168 |
prom_decisions_total.labels(action=decision.get("risk_level", "unknown")).inc()
|
| 169 |
prom_risk_gauge.set(risk)
|
| 170 |
|
| 171 |
def refresh_history_from_db():
|
|
|
|
| 172 |
global decision_history, risk_history
|
| 173 |
decisions = load_recent_decisions(100)
|
| 174 |
with history_lock:
|
|
|
|
| 177 |
for ts, dec, risk in decisions:
|
| 178 |
decision_history.append((ts, dec, risk))
|
| 179 |
risk_history.append((ts, risk))
|
|
|
|
| 180 |
if PROMETHEUS_AVAILABLE and risk_history:
|
| 181 |
prom_risk_gauge.set(risk_history[-1][1])
|
| 182 |
|
| 183 |
# ----------------------------------------------------------------------
|
| 184 |
+
# Memory monitoring
|
| 185 |
# ----------------------------------------------------------------------
|
| 186 |
def get_memory_usage():
|
|
|
|
| 187 |
try:
|
| 188 |
import resource
|
| 189 |
rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
|
|
| 204 |
return None
|
| 205 |
|
| 206 |
def memory_monitor_loop():
|
|
|
|
| 207 |
while not shutdown_event.is_set():
|
| 208 |
try:
|
| 209 |
mem_mb = get_memory_usage()
|
|
|
|
| 213 |
logger.info("Process memory: unknown")
|
| 214 |
except Exception as e:
|
| 215 |
logger.error(f"Memory logging error: {e}")
|
|
|
|
| 216 |
for _ in range(60):
|
| 217 |
if shutdown_event.is_set():
|
| 218 |
break
|
| 219 |
time.sleep(1)
|
| 220 |
|
| 221 |
# ----------------------------------------------------------------------
|
| 222 |
+
# Bayesian Risk Engine
|
| 223 |
# ----------------------------------------------------------------------
|
| 224 |
class BayesianRiskEngine:
|
| 225 |
def __init__(self, alpha=ALPHA_PRIOR, beta=BETA_PRIOR, maxlen=None):
|
| 226 |
self.alpha = alpha
|
| 227 |
self.beta = beta
|
| 228 |
self.maxlen = maxlen
|
| 229 |
+
self.events = deque(maxlen=maxlen)
|
| 230 |
self.total_failures = 0
|
| 231 |
self.total_successes = 0
|
| 232 |
|
| 233 |
def update(self, failures, successes):
|
|
|
|
| 234 |
self.events.append((failures, successes))
|
| 235 |
self.total_failures += failures
|
| 236 |
self.total_successes += successes
|
|
|
|
|
|
|
|
|
|
| 237 |
if self.maxlen is not None and len(self.events) == self.maxlen:
|
|
|
|
|
|
|
|
|
|
| 238 |
self.total_failures = sum(f for f, _ in self.events)
|
| 239 |
self.total_successes = sum(s for _, s in self.events)
|
|
|
|
|
|
|
| 240 |
self.alpha = ALPHA_PRIOR + self.total_failures
|
| 241 |
self.beta = BETA_PRIOR + self.total_successes
|
| 242 |
|
|
|
|
| 249 |
return lo, hi
|
| 250 |
|
| 251 |
# ----------------------------------------------------------------------
|
| 252 |
+
# Policy Engine
|
| 253 |
# ----------------------------------------------------------------------
|
| 254 |
class PolicyEngine:
|
| 255 |
def __init__(self):
|
|
|
|
| 264 |
return "escalate", f"Risk in escalation zone ({self.thresholds['low']}-{self.thresholds['high']})"
|
| 265 |
|
| 266 |
# ----------------------------------------------------------------------
|
| 267 |
+
# Infrastructure analysis
|
| 268 |
# ----------------------------------------------------------------------
|
| 269 |
def handle_infra_with_governance(fault_type: str, context_window: int, session_state: dict):
|
| 270 |
start_time = time.time()
|
| 271 |
try:
|
|
|
|
| 272 |
fault_type = fault_type.strip()
|
| 273 |
if fault_type not in ["none", "switch_down", "server_overload", "cascade"]:
|
| 274 |
fault_type = "none"
|
| 275 |
+
context_window = max(0, min(1000, int(context_window)))
|
| 276 |
|
| 277 |
fault_map = {
|
| 278 |
"none": (1, 99),
|
|
|
|
| 282 |
}
|
| 283 |
failures, successes = fault_map.get(fault_type, (1, 99))
|
| 284 |
|
|
|
|
| 285 |
maxlen = context_window if context_window > 0 else None
|
| 286 |
risk_engine = BayesianRiskEngine(maxlen=maxlen)
|
| 287 |
risk_engine.update(failures, successes)
|
|
|
|
| 315 |
"control_plane_decision": control_decision
|
| 316 |
}
|
| 317 |
}
|
|
|
|
| 318 |
if PROMETHEUS_AVAILABLE:
|
| 319 |
prom_decision_latency.observe(time.time() - start_time)
|
| 320 |
return output, session_state
|
|
|
|
| 324 |
|
| 325 |
def autonomous_control_decision(risk, risk_engine, policy_engine):
|
| 326 |
action, reason = policy_engine.evaluate(risk)
|
|
|
|
| 327 |
risk_level = "low" if risk < LOW_THRESHOLD else "medium" if risk < HIGH_THRESHOLD else "high"
|
| 328 |
decision = {
|
| 329 |
"timestamp": datetime.utcnow().isoformat(),
|
|
|
|
| 336 |
return decision
|
| 337 |
|
| 338 |
# ----------------------------------------------------------------------
|
| 339 |
+
# MCMC
|
| 340 |
# ----------------------------------------------------------------------
|
| 341 |
class MHMCMC:
|
| 342 |
def __init__(self, log_target, proposal_sd=0.1):
|
|
|
|
| 363 |
|
| 364 |
def run_hmc_mcmc(samples: int, warmup: int):
|
| 365 |
try:
|
|
|
|
| 366 |
samples = max(500, min(10000, int(samples)))
|
| 367 |
warmup = max(100, min(2000, int(warmup)))
|
| 368 |
if PROMETHEUS_AVAILABLE:
|
| 369 |
+
prom_mcmc_runs.inc()
|
| 370 |
|
| 371 |
+
np.random.seed(42)
|
|
|
|
| 372 |
data = np.random.normal(0.5, 0.2, 10)
|
| 373 |
|
| 374 |
def log_prior(mu):
|
| 375 |
+
return -0.5 * (mu ** 2)
|
| 376 |
|
| 377 |
def log_likelihood(mu):
|
| 378 |
return -0.5 * np.sum(((data - mu) / 0.2) ** 2)
|
|
|
|
| 408 |
return {"error": str(e)}, go.Figure(), go.Figure()
|
| 409 |
|
| 410 |
# ----------------------------------------------------------------------
|
| 411 |
+
# Dashboard plots
|
| 412 |
# ----------------------------------------------------------------------
|
| 413 |
class TTLCache:
|
| 414 |
def __init__(self, ttl_seconds=5):
|
|
|
|
| 432 |
return result
|
| 433 |
return wrapper
|
| 434 |
|
| 435 |
+
dashboard_cache = TTLCache(ttl_seconds=2)
|
| 436 |
|
| 437 |
@dashboard_cache
|
| 438 |
def generate_risk_gauge():
|
|
|
|
| 480 |
|
| 481 |
@dashboard_cache
|
| 482 |
def generate_risk_trend():
|
|
|
|
| 483 |
with history_lock:
|
| 484 |
if not risk_history:
|
| 485 |
return go.Figure()
|
|
|
|
| 487 |
risks = [r for _, r in risk_history]
|
| 488 |
fig = go.Figure()
|
| 489 |
fig.add_trace(go.Scatter(x=times, y=risks, mode='lines+markers', name='Risk', line=dict(color='red', width=2)))
|
|
|
|
| 490 |
fig.add_hline(y=LOW_THRESHOLD, line_dash="dash", line_color="green", annotation_text=f"Low ({LOW_THRESHOLD})")
|
| 491 |
fig.add_hline(y=HIGH_THRESHOLD, line_dash="dash", line_color="orange", annotation_text=f"High ({HIGH_THRESHOLD})")
|
| 492 |
fig.update_layout(title="Risk Trend", xaxis_title="Time", yaxis_title="Risk Score", yaxis_range=[0, 1])
|
|
|
|
| 516 |
# Batch simulation
|
| 517 |
# ----------------------------------------------------------------------
|
| 518 |
def run_batch_simulation(context_window: int):
    """Evaluate every known fault type once and collect the outcomes.

    Args:
        context_window: sliding-window size forwarded to the analysis call.

    Returns:
        A list of rows (list of lists) shaped for the gr.Dataframe headers
        [Fault Type, Risk, Decision, Risk Level, Confidence Interval].
    """
    rows = []
    for fault in ("none", "switch_down", "server_overload", "cascade"):
        # Fresh empty session state per fault; the returned state is unused.
        outcome, _ = handle_infra_with_governance(fault, context_window, {})
        if "error" in outcome:
            # Surface the error text in the Decision column, pad the rest.
            rows.append([fault, "Error", outcome["error"], "N/A", "N/A"])
        else:
            ci = outcome["risk_ci"]
            level = outcome["governance"]["control_plane_decision"]["risk_level"]
            rows.append([
                fault,
                f"{outcome['risk']:.4f}",
                outcome["decision"],
                level,
                f"[{ci[0]:.3f}, {ci[1]:.3f}]",
            ])
    return rows
|
|
|
|
| 541 |
|
| 542 |
# ----------------------------------------------------------------------
|
| 543 |
# Data export
|
| 544 |
# ----------------------------------------------------------------------
|
| 545 |
def export_history_to_csv():
|
|
|
|
| 546 |
try:
|
| 547 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
| 548 |
cursor = conn.cursor()
|
| 549 |
cursor.execute("SELECT timestamp, decision_json, risk FROM decisions ORDER BY timestamp")
|
| 550 |
rows = cursor.fetchall()
|
| 551 |
if not rows:
|
| 552 |
+
return None
|
| 553 |
output = io.StringIO()
|
| 554 |
writer = csv.writer(output)
|
| 555 |
writer.writerow(["Timestamp", "Decision", "Risk", "Approved", "Risk Level", "Reason"])
|
|
|
|
| 564 |
dec.get("reason", "")
|
| 565 |
])
|
| 566 |
output.seek(0)
|
| 567 |
+
# Write to a temporary file
|
| 568 |
+
with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
|
| 569 |
+
f.write(output.getvalue())
|
| 570 |
+
return f.name
|
| 571 |
except Exception as e:
|
| 572 |
logger.error(f"Export failed: {e}")
|
| 573 |
+
return None
|
| 574 |
|
| 575 |
# ----------------------------------------------------------------------
|
| 576 |
+
# Update thresholds
|
| 577 |
# ----------------------------------------------------------------------
|
| 578 |
def update_thresholds(low: float, high: float):
    """Replace the global risk thresholds after validating their ordering.

    Args:
        low: New approval threshold (risk below this is approved).
        high: New denial threshold (risk above this is denied).

    Returns:
        str: A human-readable confirmation, or an error message when the
        pair does not satisfy 0 <= low < high <= 1 (in which case the
        globals are left untouched).
    """
    global LOW_THRESHOLD, HIGH_THRESHOLD
    # Guard clause: reject out-of-order or out-of-range pairs up front.
    if not (0 <= low < high <= 1):
        return f"Invalid thresholds: low={low}, high={high}. Must satisfy 0 ≤ low < high ≤ 1."
    LOW_THRESHOLD = low
    HIGH_THRESHOLD = high
    logger.info(f"Updated thresholds: low={low}, high={high}")
    return f"Thresholds updated: approve < {low}, escalate {low}-{high}, deny > {high}"
|
| 587 |
|
| 588 |
# ----------------------------------------------------------------------
|
| 589 |
+
# OSS capabilities
|
| 590 |
# ----------------------------------------------------------------------
|
| 591 |
oss_caps = {
|
| 592 |
"edition": "OSS (Demo)",
|
|
|
|
| 649 |
All components are implemented with only `numpy`, `scipy`, and standard libraries.
|
| 650 |
""")
|
| 651 |
|
|
|
|
|
|
|
|
|
|
| 652 |
with gr.Tabs():
|
| 653 |
+
# Control Plane Dashboard
|
| 654 |
with gr.TabItem("Control Plane Dashboard"):
|
| 655 |
gr.Markdown("### 🎮 Control Plane")
|
| 656 |
with gr.Row():
|
|
|
|
| 679 |
with gr.Row():
|
| 680 |
auto_refresh = gr.Checkbox(label="Auto-refresh (3s)", value=False)
|
| 681 |
refresh_btn = gr.Button("Refresh Now")
|
| 682 |
+
timer = gr.Timer(value=3, active=False)
|
|
|
|
| 683 |
def refresh_if_enabled(auto):
    """Timer callback: refresh the five dashboard outputs only when auto-refresh is on."""
    if not auto:
        # Emit no-op updates so every bound output component is left unchanged.
        return [gr.update() for _ in range(5)]
    return refresh_dashboard()
|
| 688 |
timer.tick(refresh_if_enabled, inputs=[auto_refresh], outputs=[control_stats, risk_gauge, decision_pie, action_timeline, risk_trend])
|
| 689 |
refresh_btn.click(
|
| 690 |
fn=refresh_dashboard,
|
| 691 |
outputs=[control_stats, risk_gauge, decision_pie, action_timeline, risk_trend]
|
| 692 |
)
|
|
|
|
| 693 |
auto_refresh.change(lambda v: gr.Timer(active=v), inputs=[auto_refresh], outputs=[timer])
|
| 694 |
|
| 695 |
+
# Infrastructure Reliability
|
|
|
|
|
|
|
| 696 |
with gr.TabItem("Infrastructure Reliability"):
|
| 697 |
gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Bayesian Risk")
|
| 698 |
infra_state = gr.State(value={})
|
|
|
|
| 719 |
infra_output = gr.JSON(label="Analysis Result")
|
| 720 |
batch_results = gr.Dataframe(
|
| 721 |
headers=["Fault Type", "Risk", "Decision", "Risk Level", "Confidence Interval"],
|
| 722 |
+
label="Batch Simulation Results",
|
| 723 |
+
datatype=["str", "str", "str", "str", "str"]
|
| 724 |
)
|
| 725 |
infra_btn.click(
|
| 726 |
fn=handle_infra_with_governance,
|
|
|
|
| 733 |
outputs=[batch_results]
|
| 734 |
)
|
| 735 |
|
|
|
|
| 736 |
# Deep Analysis (MCMC)
|
|
|
|
| 737 |
with gr.TabItem("Deep Analysis (MCMC)"):
|
| 738 |
gr.Markdown("### Markov Chain Monte Carlo (Metropolis‑Hastings)")
|
| 739 |
with gr.Row():
|
|
|
|
| 752 |
outputs=[hmc_summary, hmc_trace_plot, hmc_pair_plot]
|
| 753 |
)
|
| 754 |
|
| 755 |
+
# Policy Management
|
|
|
|
|
|
|
| 756 |
with gr.TabItem("Policy Management"):
|
| 757 |
gr.Markdown("### 📋 Execution Policies")
|
| 758 |
with gr.Row():
|
|
|
|
| 781 |
outputs=[policy_display]
|
| 782 |
)
|
| 783 |
|
| 784 |
+
# Enterprise / OSS
|
|
|
|
|
|
|
| 785 |
with gr.TabItem("Enterprise / OSS"):
|
| 786 |
gr.Markdown(f"""
|
| 787 |
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 2rem; border-radius: 12px; margin-bottom: 2rem; text-align: center; color: white;">
|
|
|
|
| 830 |
<a href="mailto:petter2025us@outlook.com" style="background: #667eea; color: white; padding: 12px 24px; text-decoration: none; border-radius: 8px; font-weight: bold;">📧 Contact Sales</a>
|
| 831 |
</div>
|
| 832 |
""")
|
|
|
|
| 833 |
gr.Markdown("### 📥 Export Decision History")
|
| 834 |
+
export_btn = gr.DownloadButton("Download CSV", variant="primary")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 835 |
export_btn.click(
|
| 836 |
+
fn=export_history_to_csv,
|
| 837 |
outputs=[gr.File(label="decision_history.csv")]
|
| 838 |
)
|
| 839 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 840 |
# ----------------------------------------------------------------------
|
| 841 |
# Launch
|
| 842 |
# ----------------------------------------------------------------------
|