Update app.py
Browse files
app.py
CHANGED
|
@@ -15,6 +15,7 @@ import sys
|
|
| 15 |
import functools
|
| 16 |
import csv
|
| 17 |
import io
|
|
|
|
| 18 |
from collections import deque
|
| 19 |
from scipy.stats import beta
|
| 20 |
import plotly.graph_objects as go
|
|
@@ -59,25 +60,22 @@ os.makedirs("/var/log/arf", exist_ok=True)
|
|
| 59 |
logger = logging.getLogger(__name__)
|
| 60 |
logger.setLevel(getattr(logging, LOG_LEVEL, logging.INFO))
|
| 61 |
|
| 62 |
-
# File handler with rotation
|
| 63 |
file_handler = logging.handlers.RotatingFileHandler(
|
| 64 |
"/var/log/arf/app.log", maxBytes=10_485_760, backupCount=5
|
| 65 |
)
|
| 66 |
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
|
| 67 |
|
| 68 |
-
# Console handler (for Docker logs)
|
| 69 |
console_handler = logging.StreamHandler(sys.stdout)
|
| 70 |
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
|
| 71 |
|
| 72 |
logger.addHandler(file_handler)
|
| 73 |
logger.addHandler(console_handler)
|
| 74 |
-
logger.propagate = False
|
| 75 |
|
| 76 |
# ----------------------------------------------------------------------
|
| 77 |
# SQLite persistence with secure permissions
|
| 78 |
# ----------------------------------------------------------------------
|
| 79 |
def init_db():
|
| 80 |
-
"""Create the decisions table with secure file permissions."""
|
| 81 |
db_dir = os.path.dirname(DB_PATH)
|
| 82 |
if db_dir and not os.path.exists(db_dir):
|
| 83 |
os.makedirs(db_dir, exist_ok=True)
|
|
@@ -92,7 +90,6 @@ def init_db():
|
|
| 92 |
)
|
| 93 |
''')
|
| 94 |
conn.commit()
|
| 95 |
-
# Restrict permissions (owner read/write only) – best effort
|
| 96 |
try:
|
| 97 |
os.chmod(DB_PATH, 0o600)
|
| 98 |
except Exception as e:
|
|
@@ -100,7 +97,6 @@ def init_db():
|
|
| 100 |
logger.info(f"Database initialized at {DB_PATH}")
|
| 101 |
|
| 102 |
def save_decision_to_db(decision: dict, risk: float):
|
| 103 |
-
"""Insert a decision into the database."""
|
| 104 |
try:
|
| 105 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
| 106 |
cursor = conn.cursor()
|
|
@@ -113,7 +109,6 @@ def save_decision_to_db(decision: dict, risk: float):
|
|
| 113 |
logger.error(f"Failed to save decision to DB: {e}")
|
| 114 |
|
| 115 |
def load_recent_decisions(limit: int = 100) -> List[Tuple[str, dict, float]]:
|
| 116 |
-
"""Load the most recent decisions from the database."""
|
| 117 |
decisions = []
|
| 118 |
try:
|
| 119 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
|
@@ -125,13 +120,12 @@ def load_recent_decisions(limit: int = 100) -> List[Tuple[str, dict, float]]:
|
|
| 125 |
rows = cursor.fetchall()
|
| 126 |
for ts, json_str, risk in rows:
|
| 127 |
decisions.append((ts, json.loads(json_str), risk))
|
| 128 |
-
decisions.reverse()
|
| 129 |
except Exception as e:
|
| 130 |
logger.error(f"Failed to load decisions from DB: {e}")
|
| 131 |
return decisions
|
| 132 |
|
| 133 |
def vacuum_db():
|
| 134 |
-
"""Run VACUUM on the database (periodic maintenance)."""
|
| 135 |
try:
|
| 136 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
| 137 |
conn.execute("VACUUM")
|
|
@@ -154,7 +148,7 @@ else:
|
|
| 154 |
prom_mcmc_runs = None
|
| 155 |
|
| 156 |
# ----------------------------------------------------------------------
|
| 157 |
-
# Thread‑safe history
|
| 158 |
# ----------------------------------------------------------------------
|
| 159 |
decision_history = []
|
| 160 |
risk_history = []
|
|
@@ -162,23 +156,19 @@ history_lock = threading.Lock()
|
|
| 162 |
shutdown_event = threading.Event()
|
| 163 |
|
| 164 |
def update_dashboard_data(decision: dict, risk: float):
|
| 165 |
-
"""Thread‑safe update of both in‑memory history and database."""
|
| 166 |
with history_lock:
|
| 167 |
decision_history.append((datetime.utcnow().isoformat(), decision, risk))
|
| 168 |
risk_history.append((datetime.utcnow().isoformat(), risk))
|
| 169 |
-
# Keep only last 100 in memory
|
| 170 |
if len(decision_history) > 100:
|
| 171 |
decision_history.pop(0)
|
| 172 |
if len(risk_history) > 100:
|
| 173 |
risk_history.pop(0)
|
| 174 |
save_decision_to_db(decision, risk)
|
| 175 |
-
# Update Prometheus metrics
|
| 176 |
if PROMETHEUS_AVAILABLE:
|
| 177 |
prom_decisions_total.labels(action=decision.get("risk_level", "unknown")).inc()
|
| 178 |
prom_risk_gauge.set(risk)
|
| 179 |
|
| 180 |
def refresh_history_from_db():
|
| 181 |
-
"""Load recent history from database (called at startup)."""
|
| 182 |
global decision_history, risk_history
|
| 183 |
decisions = load_recent_decisions(100)
|
| 184 |
with history_lock:
|
|
@@ -187,15 +177,13 @@ def refresh_history_from_db():
|
|
| 187 |
for ts, dec, risk in decisions:
|
| 188 |
decision_history.append((ts, dec, risk))
|
| 189 |
risk_history.append((ts, risk))
|
| 190 |
-
# After loading, set the Prometheus gauge to the latest risk
|
| 191 |
if PROMETHEUS_AVAILABLE and risk_history:
|
| 192 |
prom_risk_gauge.set(risk_history[-1][1])
|
| 193 |
|
| 194 |
# ----------------------------------------------------------------------
|
| 195 |
-
# Memory monitoring
|
| 196 |
# ----------------------------------------------------------------------
|
| 197 |
def get_memory_usage():
|
| 198 |
-
"""Return current process memory usage in MB (RSS)."""
|
| 199 |
try:
|
| 200 |
import resource
|
| 201 |
rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
@@ -216,7 +204,6 @@ def get_memory_usage():
|
|
| 216 |
return None
|
| 217 |
|
| 218 |
def memory_monitor_loop():
|
| 219 |
-
"""Periodically log memory usage. Runs in a daemon thread."""
|
| 220 |
while not shutdown_event.is_set():
|
| 221 |
try:
|
| 222 |
mem_mb = get_memory_usage()
|
|
@@ -226,40 +213,30 @@ def memory_monitor_loop():
|
|
| 226 |
logger.info("Process memory: unknown")
|
| 227 |
except Exception as e:
|
| 228 |
logger.error(f"Memory logging error: {e}")
|
| 229 |
-
# Sleep in small intervals to react quickly to shutdown
|
| 230 |
for _ in range(60):
|
| 231 |
if shutdown_event.is_set():
|
| 232 |
break
|
| 233 |
time.sleep(1)
|
| 234 |
|
| 235 |
# ----------------------------------------------------------------------
|
| 236 |
-
# Bayesian Risk Engine
|
| 237 |
# ----------------------------------------------------------------------
|
| 238 |
class BayesianRiskEngine:
|
| 239 |
def __init__(self, alpha=ALPHA_PRIOR, beta=BETA_PRIOR, maxlen=None):
    """Beta-Binomial risk model over an optional sliding window of events.

    Args:
        alpha: prior pseudo-count of failures (defaults to module prior).
        beta: prior pseudo-count of successes (defaults to module prior).
        maxlen: size of the event window; None keeps every event.
    """
    self.alpha = alpha
    self.beta = beta
    self.maxlen = maxlen
    # Bounded deque: once full, the oldest event is discarded automatically
    # on every append, which is what makes the window "sliding".
    self.events = deque(maxlen=maxlen)
    # Running evidence totals over the events currently in the window.
    self.total_failures = 0
    self.total_successes = 0
|
| 246 |
|
| 247 |
def update(self, failures, successes):
|
| 248 |
-
# Add new event
|
| 249 |
self.events.append((failures, successes))
|
| 250 |
self.total_failures += failures
|
| 251 |
self.total_successes += successes
|
| 252 |
-
|
| 253 |
-
# If maxlen is reached and the queue overflows, we've already removed the oldest,
|
| 254 |
-
# but we need to subtract it from totals.
|
| 255 |
if self.maxlen is not None and len(self.events) == self.maxlen:
|
| 256 |
-
# The deque automatically discards the leftmost when full, but we have to
|
| 257 |
-
# manually adjust totals to reflect the discarded event.
|
| 258 |
-
# However, we can't easily know what was discarded. Instead, recompute from deque.
|
| 259 |
self.total_failures = sum(f for f, _ in self.events)
|
| 260 |
self.total_successes = sum(s for _, s in self.events)
|
| 261 |
-
|
| 262 |
-
# Set alpha,beta = prior + totals
|
| 263 |
self.alpha = ALPHA_PRIOR + self.total_failures
|
| 264 |
self.beta = BETA_PRIOR + self.total_successes
|
| 265 |
|
|
@@ -272,7 +249,7 @@ class BayesianRiskEngine:
|
|
| 272 |
return lo, hi
|
| 273 |
|
| 274 |
# ----------------------------------------------------------------------
|
| 275 |
-
# Policy Engine
|
| 276 |
# ----------------------------------------------------------------------
|
| 277 |
class PolicyEngine:
|
| 278 |
def __init__(self):
|
|
@@ -287,16 +264,15 @@ class PolicyEngine:
|
|
| 287 |
return "escalate", f"Risk in escalation zone ({self.thresholds['low']}-{self.thresholds['high']})"
|
| 288 |
|
| 289 |
# ----------------------------------------------------------------------
|
| 290 |
-
# Infrastructure analysis
|
| 291 |
# ----------------------------------------------------------------------
|
| 292 |
def handle_infra_with_governance(fault_type: str, context_window: int, session_state: dict):
|
| 293 |
start_time = time.time()
|
| 294 |
try:
|
| 295 |
-
# Input validation
|
| 296 |
fault_type = fault_type.strip()
|
| 297 |
if fault_type not in ["none", "switch_down", "server_overload", "cascade"]:
|
| 298 |
fault_type = "none"
|
| 299 |
-
context_window = max(0, min(1000, int(context_window)))
|
| 300 |
|
| 301 |
fault_map = {
|
| 302 |
"none": (1, 99),
|
|
@@ -306,7 +282,6 @@ def handle_infra_with_governance(fault_type: str, context_window: int, session_s
|
|
| 306 |
}
|
| 307 |
failures, successes = fault_map.get(fault_type, (1, 99))
|
| 308 |
|
| 309 |
-
# Use context_window: if >0, limit to last N events; else unlimited
|
| 310 |
maxlen = context_window if context_window > 0 else None
|
| 311 |
risk_engine = BayesianRiskEngine(maxlen=maxlen)
|
| 312 |
risk_engine.update(failures, successes)
|
|
@@ -340,7 +315,6 @@ def handle_infra_with_governance(fault_type: str, context_window: int, session_s
|
|
| 340 |
"control_plane_decision": control_decision
|
| 341 |
}
|
| 342 |
}
|
| 343 |
-
# Record latency metric
|
| 344 |
if PROMETHEUS_AVAILABLE:
|
| 345 |
prom_decision_latency.observe(time.time() - start_time)
|
| 346 |
return output, session_state
|
|
@@ -350,7 +324,6 @@ def handle_infra_with_governance(fault_type: str, context_window: int, session_s
|
|
| 350 |
|
| 351 |
def autonomous_control_decision(risk, risk_engine, policy_engine):
|
| 352 |
action, reason = policy_engine.evaluate(risk)
|
| 353 |
-
# Use configurable thresholds for risk level
|
| 354 |
risk_level = "low" if risk < LOW_THRESHOLD else "medium" if risk < HIGH_THRESHOLD else "high"
|
| 355 |
decision = {
|
| 356 |
"timestamp": datetime.utcnow().isoformat(),
|
|
@@ -363,7 +336,7 @@ def autonomous_control_decision(risk, risk_engine, policy_engine):
|
|
| 363 |
return decision
|
| 364 |
|
| 365 |
# ----------------------------------------------------------------------
|
| 366 |
-
# MCMC
|
| 367 |
# ----------------------------------------------------------------------
|
| 368 |
class MHMCMC:
|
| 369 |
def __init__(self, log_target, proposal_sd=0.1):
|
|
@@ -390,18 +363,16 @@ class MHMCMC:
|
|
| 390 |
|
| 391 |
def run_hmc_mcmc(samples: int, warmup: int):
|
| 392 |
try:
|
| 393 |
-
# Input validation
|
| 394 |
samples = max(500, min(10000, int(samples)))
|
| 395 |
warmup = max(100, min(2000, int(warmup)))
|
| 396 |
if PROMETHEUS_AVAILABLE:
|
| 397 |
-
prom_mcmc_runs.inc()
|
| 398 |
|
| 399 |
-
|
| 400 |
-
np.random.seed(42) # for reproducibility
|
| 401 |
data = np.random.normal(0.5, 0.2, 10)
|
| 402 |
|
| 403 |
def log_prior(mu):
|
| 404 |
-
return -0.5 * (mu ** 2)
|
| 405 |
|
| 406 |
def log_likelihood(mu):
|
| 407 |
return -0.5 * np.sum(((data - mu) / 0.2) ** 2)
|
|
@@ -437,7 +408,7 @@ def run_hmc_mcmc(samples: int, warmup: int):
|
|
| 437 |
return {"error": str(e)}, go.Figure(), go.Figure()
|
| 438 |
|
| 439 |
# ----------------------------------------------------------------------
|
| 440 |
-
# Dashboard plots
|
| 441 |
# ----------------------------------------------------------------------
|
| 442 |
class TTLCache:
|
| 443 |
def __init__(self, ttl_seconds=5):
|
|
@@ -461,7 +432,7 @@ class TTLCache:
|
|
| 461 |
return result
|
| 462 |
return wrapper
|
| 463 |
|
| 464 |
-
dashboard_cache = TTLCache(ttl_seconds=2)
|
| 465 |
|
| 466 |
@dashboard_cache
|
| 467 |
def generate_risk_gauge():
|
|
@@ -509,7 +480,6 @@ def generate_action_timeline():
|
|
| 509 |
|
| 510 |
@dashboard_cache
|
| 511 |
def generate_risk_trend():
|
| 512 |
-
"""Line chart showing risk over time."""
|
| 513 |
with history_lock:
|
| 514 |
if not risk_history:
|
| 515 |
return go.Figure()
|
|
@@ -517,7 +487,6 @@ def generate_risk_trend():
|
|
| 517 |
risks = [r for _, r in risk_history]
|
| 518 |
fig = go.Figure()
|
| 519 |
fig.add_trace(go.Scatter(x=times, y=risks, mode='lines+markers', name='Risk', line=dict(color='red', width=2)))
|
| 520 |
-
# Add horizontal lines for thresholds
|
| 521 |
fig.add_hline(y=LOW_THRESHOLD, line_dash="dash", line_color="green", annotation_text=f"Low ({LOW_THRESHOLD})")
|
| 522 |
fig.add_hline(y=HIGH_THRESHOLD, line_dash="dash", line_color="orange", annotation_text=f"High ({HIGH_THRESHOLD})")
|
| 523 |
fig.update_layout(title="Risk Trend", xaxis_title="Time", yaxis_title="Risk Score", yaxis_range=[0, 1])
|
|
@@ -547,45 +516,40 @@ def refresh_dashboard():
|
|
| 547 |
# Batch simulation
|
| 548 |
# ----------------------------------------------------------------------
|
| 549 |
def run_batch_simulation(context_window: int):
|
| 550 |
-
"""Run evaluation for all fault types and return a summary table and individual results."""
|
| 551 |
fault_types = ["none", "switch_down", "server_overload", "cascade"]
|
| 552 |
results = []
|
| 553 |
for fault in fault_types:
|
| 554 |
-
# We'll call handle_infra_with_governance directly
|
| 555 |
-
# Since it returns (output, state) and we ignore state
|
| 556 |
output, _ = handle_infra_with_governance(fault, context_window, {})
|
| 557 |
if "error" in output:
|
| 558 |
-
results.append(
|
| 559 |
-
|
| 560 |
-
"
|
| 561 |
-
|
| 562 |
-
"
|
| 563 |
-
"
|
| 564 |
-
|
| 565 |
else:
|
| 566 |
-
results.append(
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
#
|
| 574 |
-
|
| 575 |
-
return summary_table
|
| 576 |
|
| 577 |
# ----------------------------------------------------------------------
|
| 578 |
# Data export
|
| 579 |
# ----------------------------------------------------------------------
|
| 580 |
def export_history_to_csv():
|
| 581 |
-
"""Generate CSV of all decisions from database."""
|
| 582 |
try:
|
| 583 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
| 584 |
cursor = conn.cursor()
|
| 585 |
cursor.execute("SELECT timestamp, decision_json, risk FROM decisions ORDER BY timestamp")
|
| 586 |
rows = cursor.fetchall()
|
| 587 |
if not rows:
|
| 588 |
-
return
|
| 589 |
output = io.StringIO()
|
| 590 |
writer = csv.writer(output)
|
| 591 |
writer.writerow(["Timestamp", "Decision", "Risk", "Approved", "Risk Level", "Reason"])
|
|
@@ -600,27 +564,29 @@ def export_history_to_csv():
|
|
| 600 |
dec.get("reason", "")
|
| 601 |
])
|
| 602 |
output.seek(0)
|
| 603 |
-
|
|
|
|
|
|
|
|
|
|
| 604 |
except Exception as e:
|
| 605 |
logger.error(f"Export failed: {e}")
|
| 606 |
-
return
|
| 607 |
|
| 608 |
# ----------------------------------------------------------------------
|
| 609 |
-
# Update thresholds
|
| 610 |
# ----------------------------------------------------------------------
|
| 611 |
def update_thresholds(low: float, high: float):
|
| 612 |
global LOW_THRESHOLD, HIGH_THRESHOLD
|
| 613 |
if 0 <= low < high <= 1:
|
| 614 |
LOW_THRESHOLD = low
|
| 615 |
HIGH_THRESHOLD = high
|
| 616 |
-
# Also update PolicyEngine thresholds (but PolicyEngine reads from globals, so fine)
|
| 617 |
logger.info(f"Updated thresholds: low={low}, high={high}")
|
| 618 |
return f"Thresholds updated: approve < {low}, escalate {low}-{high}, deny > {high}"
|
| 619 |
else:
|
| 620 |
return f"Invalid thresholds: low={low}, high={high}. Must satisfy 0 ≤ low < high ≤ 1."
|
| 621 |
|
| 622 |
# ----------------------------------------------------------------------
|
| 623 |
-
# OSS capabilities
|
| 624 |
# ----------------------------------------------------------------------
|
| 625 |
oss_caps = {
|
| 626 |
"edition": "OSS (Demo)",
|
|
@@ -683,10 +649,8 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
|
|
| 683 |
All components are implemented with only `numpy`, `scipy`, and standard libraries.
|
| 684 |
""")
|
| 685 |
|
| 686 |
-
# ------------------------------------------------------------------
|
| 687 |
-
# Control Plane Dashboard with auto-refresh
|
| 688 |
-
# ------------------------------------------------------------------
|
| 689 |
with gr.Tabs():
|
|
|
|
| 690 |
with gr.TabItem("Control Plane Dashboard"):
|
| 691 |
gr.Markdown("### 🎮 Control Plane")
|
| 692 |
with gr.Row():
|
|
@@ -715,24 +679,20 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
|
|
| 715 |
with gr.Row():
|
| 716 |
auto_refresh = gr.Checkbox(label="Auto-refresh (3s)", value=False)
|
| 717 |
refresh_btn = gr.Button("Refresh Now")
|
| 718 |
-
|
| 719 |
-
timer = gr.Timer(value=3, active=False) # will be toggled
|
| 720 |
def refresh_if_enabled(auto):
|
| 721 |
if auto:
|
| 722 |
return refresh_dashboard()
|
| 723 |
else:
|
| 724 |
-
return [gr.update() for _ in range(5)]
|
| 725 |
timer.tick(refresh_if_enabled, inputs=[auto_refresh], outputs=[control_stats, risk_gauge, decision_pie, action_timeline, risk_trend])
|
| 726 |
refresh_btn.click(
|
| 727 |
fn=refresh_dashboard,
|
| 728 |
outputs=[control_stats, risk_gauge, decision_pie, action_timeline, risk_trend]
|
| 729 |
)
|
| 730 |
-
# Start/stop timer based on checkbox
|
| 731 |
auto_refresh.change(lambda v: gr.Timer(active=v), inputs=[auto_refresh], outputs=[timer])
|
| 732 |
|
| 733 |
-
#
|
| 734 |
-
# Infrastructure Reliability (with batch simulation)
|
| 735 |
-
# ------------------------------------------------------------------
|
| 736 |
with gr.TabItem("Infrastructure Reliability"):
|
| 737 |
gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Bayesian Risk")
|
| 738 |
infra_state = gr.State(value={})
|
|
@@ -759,7 +719,8 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
|
|
| 759 |
infra_output = gr.JSON(label="Analysis Result")
|
| 760 |
batch_results = gr.Dataframe(
|
| 761 |
headers=["Fault Type", "Risk", "Decision", "Risk Level", "Confidence Interval"],
|
| 762 |
-
label="Batch Simulation Results"
|
|
|
|
| 763 |
)
|
| 764 |
infra_btn.click(
|
| 765 |
fn=handle_infra_with_governance,
|
|
@@ -772,9 +733,7 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
|
|
| 772 |
outputs=[batch_results]
|
| 773 |
)
|
| 774 |
|
| 775 |
-
# ------------------------------------------------------------------
|
| 776 |
# Deep Analysis (MCMC)
|
| 777 |
-
# ------------------------------------------------------------------
|
| 778 |
with gr.TabItem("Deep Analysis (MCMC)"):
|
| 779 |
gr.Markdown("### Markov Chain Monte Carlo (Metropolis‑Hastings)")
|
| 780 |
with gr.Row():
|
|
@@ -793,9 +752,7 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
|
|
| 793 |
outputs=[hmc_summary, hmc_trace_plot, hmc_pair_plot]
|
| 794 |
)
|
| 795 |
|
| 796 |
-
#
|
| 797 |
-
# Policy Management (with interactive sliders)
|
| 798 |
-
# ------------------------------------------------------------------
|
| 799 |
with gr.TabItem("Policy Management"):
|
| 800 |
gr.Markdown("### 📋 Execution Policies")
|
| 801 |
with gr.Row():
|
|
@@ -824,9 +781,7 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
|
|
| 824 |
outputs=[policy_display]
|
| 825 |
)
|
| 826 |
|
| 827 |
-
#
|
| 828 |
-
# Enterprise / OSS (with data export)
|
| 829 |
-
# ------------------------------------------------------------------
|
| 830 |
with gr.TabItem("Enterprise / OSS"):
|
| 831 |
gr.Markdown(f"""
|
| 832 |
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 2rem; border-radius: 12px; margin-bottom: 2rem; text-align: center; color: white;">
|
|
@@ -875,35 +830,13 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
|
|
| 875 |
<a href="mailto:petter2025us@outlook.com" style="background: #667eea; color: white; padding: 12px 24px; text-decoration: none; border-radius: 8px; font-weight: bold;">📧 Contact Sales</a>
|
| 876 |
</div>
|
| 877 |
""")
|
| 878 |
-
# Data export section
|
| 879 |
gr.Markdown("### 📥 Export Decision History")
|
| 880 |
-
|
| 881 |
-
export_btn = gr.DownloadButton("Download CSV", variant="primary")
|
| 882 |
-
export_btn.click(
|
| 883 |
-
fn=export_history_to_csv,
|
| 884 |
-
outputs=[gr.File(label="decision_history.csv", visible=False)] # hidden, but we need a file component
|
| 885 |
-
)
|
| 886 |
-
# Note: gradio DownloadButton works with a function that returns a file path or bytes.
|
| 887 |
-
# We'll create a temporary file.
|
| 888 |
-
def export_and_return_file():
|
| 889 |
-
csv_data = export_history_to_csv()
|
| 890 |
-
if csv_data and not csv_data.startswith("Export failed"):
|
| 891 |
-
# Write to a temporary file
|
| 892 |
-
import tempfile
|
| 893 |
-
with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
|
| 894 |
-
f.write(csv_data)
|
| 895 |
-
return f.name
|
| 896 |
-
else:
|
| 897 |
-
return None
|
| 898 |
export_btn.click(
|
| 899 |
-
fn=
|
| 900 |
outputs=[gr.File(label="decision_history.csv")]
|
| 901 |
)
|
| 902 |
|
| 903 |
-
# ------------------------------------------------------------------
|
| 904 |
-
# Wire events for infra and MCMC (already done above)
|
| 905 |
-
# ------------------------------------------------------------------
|
| 906 |
-
|
| 907 |
# ----------------------------------------------------------------------
|
| 908 |
# Launch
|
| 909 |
# ----------------------------------------------------------------------
|
|
|
|
| 15 |
import functools
|
| 16 |
import csv
|
| 17 |
import io
|
| 18 |
+
import tempfile
|
| 19 |
from collections import deque
|
| 20 |
from scipy.stats import beta
|
| 21 |
import plotly.graph_objects as go
|
|
|
|
| 60 |
logger = logging.getLogger(__name__)
|
| 61 |
logger.setLevel(getattr(logging, LOG_LEVEL, logging.INFO))
|
| 62 |
|
|
|
|
| 63 |
file_handler = logging.handlers.RotatingFileHandler(
|
| 64 |
"/var/log/arf/app.log", maxBytes=10_485_760, backupCount=5
|
| 65 |
)
|
| 66 |
file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
|
| 67 |
|
|
|
|
| 68 |
console_handler = logging.StreamHandler(sys.stdout)
|
| 69 |
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
|
| 70 |
|
| 71 |
logger.addHandler(file_handler)
|
| 72 |
logger.addHandler(console_handler)
|
| 73 |
+
logger.propagate = False
|
| 74 |
|
| 75 |
# ----------------------------------------------------------------------
|
| 76 |
# SQLite persistence with secure permissions
|
| 77 |
# ----------------------------------------------------------------------
|
| 78 |
def init_db():
|
|
|
|
| 79 |
db_dir = os.path.dirname(DB_PATH)
|
| 80 |
if db_dir and not os.path.exists(db_dir):
|
| 81 |
os.makedirs(db_dir, exist_ok=True)
|
|
|
|
| 90 |
)
|
| 91 |
''')
|
| 92 |
conn.commit()
|
|
|
|
| 93 |
try:
|
| 94 |
os.chmod(DB_PATH, 0o600)
|
| 95 |
except Exception as e:
|
|
|
|
| 97 |
logger.info(f"Database initialized at {DB_PATH}")
|
| 98 |
|
| 99 |
def save_decision_to_db(decision: dict, risk: float):
|
|
|
|
| 100 |
try:
|
| 101 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
| 102 |
cursor = conn.cursor()
|
|
|
|
| 109 |
logger.error(f"Failed to save decision to DB: {e}")
|
| 110 |
|
| 111 |
def load_recent_decisions(limit: int = 100) -> List[Tuple[str, dict, float]]:
|
|
|
|
| 112 |
decisions = []
|
| 113 |
try:
|
| 114 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
|
|
|
| 120 |
rows = cursor.fetchall()
|
| 121 |
for ts, json_str, risk in rows:
|
| 122 |
decisions.append((ts, json.loads(json_str), risk))
|
| 123 |
+
decisions.reverse()
|
| 124 |
except Exception as e:
|
| 125 |
logger.error(f"Failed to load decisions from DB: {e}")
|
| 126 |
return decisions
|
| 127 |
|
| 128 |
def vacuum_db():
|
|
|
|
| 129 |
try:
|
| 130 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
| 131 |
conn.execute("VACUUM")
|
|
|
|
| 148 |
prom_mcmc_runs = None
|
| 149 |
|
| 150 |
# ----------------------------------------------------------------------
|
| 151 |
+
# Thread‑safe history
|
| 152 |
# ----------------------------------------------------------------------
|
| 153 |
decision_history = []
|
| 154 |
risk_history = []
|
|
|
|
| 156 |
shutdown_event = threading.Event()
|
| 157 |
|
| 158 |
def update_dashboard_data(decision: dict, risk: float):
|
|
|
|
| 159 |
with history_lock:
|
| 160 |
decision_history.append((datetime.utcnow().isoformat(), decision, risk))
|
| 161 |
risk_history.append((datetime.utcnow().isoformat(), risk))
|
|
|
|
| 162 |
if len(decision_history) > 100:
|
| 163 |
decision_history.pop(0)
|
| 164 |
if len(risk_history) > 100:
|
| 165 |
risk_history.pop(0)
|
| 166 |
save_decision_to_db(decision, risk)
|
|
|
|
| 167 |
if PROMETHEUS_AVAILABLE:
|
| 168 |
prom_decisions_total.labels(action=decision.get("risk_level", "unknown")).inc()
|
| 169 |
prom_risk_gauge.set(risk)
|
| 170 |
|
| 171 |
def refresh_history_from_db():
|
|
|
|
| 172 |
global decision_history, risk_history
|
| 173 |
decisions = load_recent_decisions(100)
|
| 174 |
with history_lock:
|
|
|
|
| 177 |
for ts, dec, risk in decisions:
|
| 178 |
decision_history.append((ts, dec, risk))
|
| 179 |
risk_history.append((ts, risk))
|
|
|
|
| 180 |
if PROMETHEUS_AVAILABLE and risk_history:
|
| 181 |
prom_risk_gauge.set(risk_history[-1][1])
|
| 182 |
|
| 183 |
# ----------------------------------------------------------------------
|
| 184 |
+
# Memory monitoring
|
| 185 |
# ----------------------------------------------------------------------
|
| 186 |
def get_memory_usage():
|
|
|
|
| 187 |
try:
|
| 188 |
import resource
|
| 189 |
rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
|
|
|
|
| 204 |
return None
|
| 205 |
|
| 206 |
def memory_monitor_loop():
|
|
|
|
| 207 |
while not shutdown_event.is_set():
|
| 208 |
try:
|
| 209 |
mem_mb = get_memory_usage()
|
|
|
|
| 213 |
logger.info("Process memory: unknown")
|
| 214 |
except Exception as e:
|
| 215 |
logger.error(f"Memory logging error: {e}")
|
|
|
|
| 216 |
for _ in range(60):
|
| 217 |
if shutdown_event.is_set():
|
| 218 |
break
|
| 219 |
time.sleep(1)
|
| 220 |
|
| 221 |
# ----------------------------------------------------------------------
|
| 222 |
+
# Bayesian Risk Engine
|
| 223 |
# ----------------------------------------------------------------------
|
| 224 |
class BayesianRiskEngine:
|
| 225 |
def __init__(self, alpha=ALPHA_PRIOR, beta=BETA_PRIOR, maxlen=None):
|
| 226 |
self.alpha = alpha
|
| 227 |
self.beta = beta
|
| 228 |
self.maxlen = maxlen
|
| 229 |
+
self.events = deque(maxlen=maxlen)
|
| 230 |
self.total_failures = 0
|
| 231 |
self.total_successes = 0
|
| 232 |
|
| 233 |
def update(self, failures, successes):
|
|
|
|
| 234 |
self.events.append((failures, successes))
|
| 235 |
self.total_failures += failures
|
| 236 |
self.total_successes += successes
|
|
|
|
|
|
|
|
|
|
| 237 |
if self.maxlen is not None and len(self.events) == self.maxlen:
|
|
|
|
|
|
|
|
|
|
| 238 |
self.total_failures = sum(f for f, _ in self.events)
|
| 239 |
self.total_successes = sum(s for _, s in self.events)
|
|
|
|
|
|
|
| 240 |
self.alpha = ALPHA_PRIOR + self.total_failures
|
| 241 |
self.beta = BETA_PRIOR + self.total_successes
|
| 242 |
|
|
|
|
| 249 |
return lo, hi
|
| 250 |
|
| 251 |
# ----------------------------------------------------------------------
|
| 252 |
+
# Policy Engine
|
| 253 |
# ----------------------------------------------------------------------
|
| 254 |
class PolicyEngine:
|
| 255 |
def __init__(self):
|
|
|
|
| 264 |
return "escalate", f"Risk in escalation zone ({self.thresholds['low']}-{self.thresholds['high']})"
|
| 265 |
|
| 266 |
# ----------------------------------------------------------------------
|
| 267 |
+
# Infrastructure analysis
|
| 268 |
# ----------------------------------------------------------------------
|
| 269 |
def handle_infra_with_governance(fault_type: str, context_window: int, session_state: dict):
|
| 270 |
start_time = time.time()
|
| 271 |
try:
|
|
|
|
| 272 |
fault_type = fault_type.strip()
|
| 273 |
if fault_type not in ["none", "switch_down", "server_overload", "cascade"]:
|
| 274 |
fault_type = "none"
|
| 275 |
+
context_window = max(0, min(1000, int(context_window)))
|
| 276 |
|
| 277 |
fault_map = {
|
| 278 |
"none": (1, 99),
|
|
|
|
| 282 |
}
|
| 283 |
failures, successes = fault_map.get(fault_type, (1, 99))
|
| 284 |
|
|
|
|
| 285 |
maxlen = context_window if context_window > 0 else None
|
| 286 |
risk_engine = BayesianRiskEngine(maxlen=maxlen)
|
| 287 |
risk_engine.update(failures, successes)
|
|
|
|
| 315 |
"control_plane_decision": control_decision
|
| 316 |
}
|
| 317 |
}
|
|
|
|
| 318 |
if PROMETHEUS_AVAILABLE:
|
| 319 |
prom_decision_latency.observe(time.time() - start_time)
|
| 320 |
return output, session_state
|
|
|
|
| 324 |
|
| 325 |
def autonomous_control_decision(risk, risk_engine, policy_engine):
|
| 326 |
action, reason = policy_engine.evaluate(risk)
|
|
|
|
| 327 |
risk_level = "low" if risk < LOW_THRESHOLD else "medium" if risk < HIGH_THRESHOLD else "high"
|
| 328 |
decision = {
|
| 329 |
"timestamp": datetime.utcnow().isoformat(),
|
|
|
|
| 336 |
return decision
|
| 337 |
|
| 338 |
# ----------------------------------------------------------------------
|
| 339 |
+
# MCMC
|
| 340 |
# ----------------------------------------------------------------------
|
| 341 |
class MHMCMC:
|
| 342 |
def __init__(self, log_target, proposal_sd=0.1):
|
|
|
|
| 363 |
|
| 364 |
def run_hmc_mcmc(samples: int, warmup: int):
|
| 365 |
try:
|
|
|
|
| 366 |
samples = max(500, min(10000, int(samples)))
|
| 367 |
warmup = max(100, min(2000, int(warmup)))
|
| 368 |
if PROMETHEUS_AVAILABLE:
|
| 369 |
+
prom_mcmc_runs.inc()
|
| 370 |
|
| 371 |
+
np.random.seed(42)
|
|
|
|
| 372 |
data = np.random.normal(0.5, 0.2, 10)
|
| 373 |
|
| 374 |
def log_prior(mu):
|
| 375 |
+
return -0.5 * (mu ** 2)
|
| 376 |
|
| 377 |
def log_likelihood(mu):
|
| 378 |
return -0.5 * np.sum(((data - mu) / 0.2) ** 2)
|
|
|
|
| 408 |
return {"error": str(e)}, go.Figure(), go.Figure()
|
| 409 |
|
| 410 |
# ----------------------------------------------------------------------
|
| 411 |
+
# Dashboard plots
|
| 412 |
# ----------------------------------------------------------------------
|
| 413 |
class TTLCache:
|
| 414 |
def __init__(self, ttl_seconds=5):
|
|
|
|
| 432 |
return result
|
| 433 |
return wrapper
|
| 434 |
|
| 435 |
+
dashboard_cache = TTLCache(ttl_seconds=2)
|
| 436 |
|
| 437 |
@dashboard_cache
|
| 438 |
def generate_risk_gauge():
|
|
|
|
| 480 |
|
| 481 |
@dashboard_cache
|
| 482 |
def generate_risk_trend():
|
|
|
|
| 483 |
with history_lock:
|
| 484 |
if not risk_history:
|
| 485 |
return go.Figure()
|
|
|
|
| 487 |
risks = [r for _, r in risk_history]
|
| 488 |
fig = go.Figure()
|
| 489 |
fig.add_trace(go.Scatter(x=times, y=risks, mode='lines+markers', name='Risk', line=dict(color='red', width=2)))
|
|
|
|
| 490 |
fig.add_hline(y=LOW_THRESHOLD, line_dash="dash", line_color="green", annotation_text=f"Low ({LOW_THRESHOLD})")
|
| 491 |
fig.add_hline(y=HIGH_THRESHOLD, line_dash="dash", line_color="orange", annotation_text=f"High ({HIGH_THRESHOLD})")
|
| 492 |
fig.update_layout(title="Risk Trend", xaxis_title="Time", yaxis_title="Risk Score", yaxis_range=[0, 1])
|
|
|
|
| 516 |
# Batch simulation
|
| 517 |
# ----------------------------------------------------------------------
|
| 518 |
def run_batch_simulation(context_window: int):
    """Evaluate every known fault type once and collect the outcomes.

    Args:
        context_window: sliding-window size forwarded to the analysis call.

    Returns:
        A list of rows (list of lists) shaped for the gr.Dataframe headers
        [Fault Type, Risk, Decision, Risk Level, Confidence Interval].
    """
    rows = []
    for fault in ("none", "switch_down", "server_overload", "cascade"):
        # Fresh empty session state per fault; the returned state is unused.
        outcome, _ = handle_infra_with_governance(fault, context_window, {})
        if "error" in outcome:
            # Surface the error text in the Decision column, pad the rest.
            rows.append([fault, "Error", outcome["error"], "N/A", "N/A"])
        else:
            ci = outcome["risk_ci"]
            level = outcome["governance"]["control_plane_decision"]["risk_level"]
            rows.append([
                fault,
                f"{outcome['risk']:.4f}",
                outcome["decision"],
                level,
                f"[{ci[0]:.3f}, {ci[1]:.3f}]",
            ])
    return rows
|
|
|
|
| 541 |
|
| 542 |
# ----------------------------------------------------------------------
|
| 543 |
# Data export
|
| 544 |
# ----------------------------------------------------------------------
|
| 545 |
def export_history_to_csv():
|
|
|
|
| 546 |
try:
|
| 547 |
with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
|
| 548 |
cursor = conn.cursor()
|
| 549 |
cursor.execute("SELECT timestamp, decision_json, risk FROM decisions ORDER BY timestamp")
|
| 550 |
rows = cursor.fetchall()
|
| 551 |
if not rows:
|
| 552 |
+
return None
|
| 553 |
output = io.StringIO()
|
| 554 |
writer = csv.writer(output)
|
| 555 |
writer.writerow(["Timestamp", "Decision", "Risk", "Approved", "Risk Level", "Reason"])
|
|
|
|
| 564 |
dec.get("reason", "")
|
| 565 |
])
|
| 566 |
output.seek(0)
|
| 567 |
+
# Write to a temporary file
|
| 568 |
+
with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
|
| 569 |
+
f.write(output.getvalue())
|
| 570 |
+
return f.name
|
| 571 |
except Exception as e:
|
| 572 |
logger.error(f"Export failed: {e}")
|
| 573 |
+
return None
|
| 574 |
|
| 575 |
# ----------------------------------------------------------------------
|
| 576 |
+
# Update thresholds
|
| 577 |
# ----------------------------------------------------------------------
|
| 578 |
def update_thresholds(low: float, high: float):
    """Replace the global risk thresholds after validating their ordering.

    Args:
        low: New approval threshold (risk below this is approved).
        high: New denial threshold (risk above this is denied).

    Returns:
        str: A human-readable confirmation, or an error message when the
        pair does not satisfy 0 <= low < high <= 1 (in which case the
        globals are left untouched).
    """
    global LOW_THRESHOLD, HIGH_THRESHOLD
    # Guard clause: reject out-of-order or out-of-range pairs up front.
    if not (0 <= low < high <= 1):
        return f"Invalid thresholds: low={low}, high={high}. Must satisfy 0 ≤ low < high ≤ 1."
    LOW_THRESHOLD = low
    HIGH_THRESHOLD = high
    logger.info(f"Updated thresholds: low={low}, high={high}")
    return f"Thresholds updated: approve < {low}, escalate {low}-{high}, deny > {high}"
|
| 587 |
|
| 588 |
# ----------------------------------------------------------------------
|
| 589 |
+
# OSS capabilities
|
| 590 |
# ----------------------------------------------------------------------
|
| 591 |
oss_caps = {
|
| 592 |
"edition": "OSS (Demo)",
|
|
|
|
| 649 |
All components are implemented with only `numpy`, `scipy`, and standard libraries.
|
| 650 |
""")
|
| 651 |
|
|
|
|
|
|
|
|
|
|
| 652 |
with gr.Tabs():
|
| 653 |
+
# Control Plane Dashboard
|
| 654 |
with gr.TabItem("Control Plane Dashboard"):
|
| 655 |
gr.Markdown("### 🎮 Control Plane")
|
| 656 |
with gr.Row():
|
|
|
|
| 679 |
with gr.Row():
|
| 680 |
auto_refresh = gr.Checkbox(label="Auto-refresh (3s)", value=False)
|
| 681 |
refresh_btn = gr.Button("Refresh Now")
|
| 682 |
+
timer = gr.Timer(value=3, active=False)
|
|
|
|
| 683 |
def refresh_if_enabled(auto):
    """Timer callback: refresh the five dashboard outputs only when auto-refresh is on."""
    if not auto:
        # Emit no-op updates so every bound output component is left unchanged.
        return [gr.update() for _ in range(5)]
    return refresh_dashboard()
|
| 688 |
timer.tick(refresh_if_enabled, inputs=[auto_refresh], outputs=[control_stats, risk_gauge, decision_pie, action_timeline, risk_trend])
|
| 689 |
refresh_btn.click(
|
| 690 |
fn=refresh_dashboard,
|
| 691 |
outputs=[control_stats, risk_gauge, decision_pie, action_timeline, risk_trend]
|
| 692 |
)
|
|
|
|
| 693 |
auto_refresh.change(lambda v: gr.Timer(active=v), inputs=[auto_refresh], outputs=[timer])
|
| 694 |
|
| 695 |
+
# Infrastructure Reliability
|
|
|
|
|
|
|
| 696 |
with gr.TabItem("Infrastructure Reliability"):
|
| 697 |
gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Bayesian Risk")
|
| 698 |
infra_state = gr.State(value={})
|
|
|
|
| 719 |
infra_output = gr.JSON(label="Analysis Result")
|
| 720 |
batch_results = gr.Dataframe(
|
| 721 |
headers=["Fault Type", "Risk", "Decision", "Risk Level", "Confidence Interval"],
|
| 722 |
+
label="Batch Simulation Results",
|
| 723 |
+
datatype=["str", "str", "str", "str", "str"]
|
| 724 |
)
|
| 725 |
infra_btn.click(
|
| 726 |
fn=handle_infra_with_governance,
|
|
|
|
| 733 |
outputs=[batch_results]
|
| 734 |
)
|
| 735 |
|
|
|
|
| 736 |
# Deep Analysis (MCMC)
|
|
|
|
| 737 |
with gr.TabItem("Deep Analysis (MCMC)"):
|
| 738 |
gr.Markdown("### Markov Chain Monte Carlo (Metropolis‑Hastings)")
|
| 739 |
with gr.Row():
|
|
|
|
| 752 |
outputs=[hmc_summary, hmc_trace_plot, hmc_pair_plot]
|
| 753 |
)
|
| 754 |
|
| 755 |
+
# Policy Management
|
|
|
|
|
|
|
| 756 |
with gr.TabItem("Policy Management"):
|
| 757 |
gr.Markdown("### 📋 Execution Policies")
|
| 758 |
with gr.Row():
|
|
|
|
| 781 |
outputs=[policy_display]
|
| 782 |
)
|
| 783 |
|
| 784 |
+
# Enterprise / OSS
|
|
|
|
|
|
|
| 785 |
with gr.TabItem("Enterprise / OSS"):
|
| 786 |
gr.Markdown(f"""
|
| 787 |
<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 2rem; border-radius: 12px; margin-bottom: 2rem; text-align: center; color: white;">
|
|
|
|
| 830 |
<a href="mailto:petter2025us@outlook.com" style="background: #667eea; color: white; padding: 12px 24px; text-decoration: none; border-radius: 8px; font-weight: bold;">📧 Contact Sales</a>
|
| 831 |
</div>
|
| 832 |
""")
|
|
|
|
| 833 |
gr.Markdown("### 📥 Export Decision History")
|
| 834 |
+
export_btn = gr.DownloadButton("Download CSV", variant="primary")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 835 |
export_btn.click(
|
| 836 |
+
fn=export_history_to_csv,
|
| 837 |
outputs=[gr.File(label="decision_history.csv")]
|
| 838 |
)
|
| 839 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 840 |
# ----------------------------------------------------------------------
|
| 841 |
# Launch
|
| 842 |
# ----------------------------------------------------------------------
|