petter2025 committed on
Commit
d9d84fd
·
verified ·
1 Parent(s): 5b9eca0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -117
app.py CHANGED
@@ -15,6 +15,7 @@ import sys
15
  import functools
16
  import csv
17
  import io
 
18
  from collections import deque
19
  from scipy.stats import beta
20
  import plotly.graph_objects as go
@@ -59,25 +60,22 @@ os.makedirs("/var/log/arf", exist_ok=True)
59
  logger = logging.getLogger(__name__)
60
  logger.setLevel(getattr(logging, LOG_LEVEL, logging.INFO))
61
 
62
- # File handler with rotation
63
  file_handler = logging.handlers.RotatingFileHandler(
64
  "/var/log/arf/app.log", maxBytes=10_485_760, backupCount=5
65
  )
66
  file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
67
 
68
- # Console handler (for Docker logs)
69
  console_handler = logging.StreamHandler(sys.stdout)
70
  console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
71
 
72
  logger.addHandler(file_handler)
73
  logger.addHandler(console_handler)
74
- logger.propagate = False # Prevent duplicate logs
75
 
76
  # ----------------------------------------------------------------------
77
  # SQLite persistence with secure permissions
78
  # ----------------------------------------------------------------------
79
  def init_db():
80
- """Create the decisions table with secure file permissions."""
81
  db_dir = os.path.dirname(DB_PATH)
82
  if db_dir and not os.path.exists(db_dir):
83
  os.makedirs(db_dir, exist_ok=True)
@@ -92,7 +90,6 @@ def init_db():
92
  )
93
  ''')
94
  conn.commit()
95
- # Restrict permissions (owner read/write only) – best effort
96
  try:
97
  os.chmod(DB_PATH, 0o600)
98
  except Exception as e:
@@ -100,7 +97,6 @@ def init_db():
100
  logger.info(f"Database initialized at {DB_PATH}")
101
 
102
  def save_decision_to_db(decision: dict, risk: float):
103
- """Insert a decision into the database."""
104
  try:
105
  with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
106
  cursor = conn.cursor()
@@ -113,7 +109,6 @@ def save_decision_to_db(decision: dict, risk: float):
113
  logger.error(f"Failed to save decision to DB: {e}")
114
 
115
  def load_recent_decisions(limit: int = 100) -> List[Tuple[str, dict, float]]:
116
- """Load the most recent decisions from the database."""
117
  decisions = []
118
  try:
119
  with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
@@ -125,13 +120,12 @@ def load_recent_decisions(limit: int = 100) -> List[Tuple[str, dict, float]]:
125
  rows = cursor.fetchall()
126
  for ts, json_str, risk in rows:
127
  decisions.append((ts, json.loads(json_str), risk))
128
- decisions.reverse() # oldest first
129
  except Exception as e:
130
  logger.error(f"Failed to load decisions from DB: {e}")
131
  return decisions
132
 
133
  def vacuum_db():
134
- """Run VACUUM on the database (periodic maintenance)."""
135
  try:
136
  with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
137
  conn.execute("VACUUM")
@@ -154,7 +148,7 @@ else:
154
  prom_mcmc_runs = None
155
 
156
  # ----------------------------------------------------------------------
157
- # Thread‑safe history (in‑memory + DB backup)
158
  # ----------------------------------------------------------------------
159
  decision_history = []
160
  risk_history = []
@@ -162,23 +156,19 @@ history_lock = threading.Lock()
162
  shutdown_event = threading.Event()
163
 
164
  def update_dashboard_data(decision: dict, risk: float):
165
- """Thread‑safe update of both in‑memory history and database."""
166
  with history_lock:
167
  decision_history.append((datetime.utcnow().isoformat(), decision, risk))
168
  risk_history.append((datetime.utcnow().isoformat(), risk))
169
- # Keep only last 100 in memory
170
  if len(decision_history) > 100:
171
  decision_history.pop(0)
172
  if len(risk_history) > 100:
173
  risk_history.pop(0)
174
  save_decision_to_db(decision, risk)
175
- # Update Prometheus metrics
176
  if PROMETHEUS_AVAILABLE:
177
  prom_decisions_total.labels(action=decision.get("risk_level", "unknown")).inc()
178
  prom_risk_gauge.set(risk)
179
 
180
  def refresh_history_from_db():
181
- """Load recent history from database (called at startup)."""
182
  global decision_history, risk_history
183
  decisions = load_recent_decisions(100)
184
  with history_lock:
@@ -187,15 +177,13 @@ def refresh_history_from_db():
187
  for ts, dec, risk in decisions:
188
  decision_history.append((ts, dec, risk))
189
  risk_history.append((ts, risk))
190
- # After loading, set the Prometheus gauge to the latest risk
191
  if PROMETHEUS_AVAILABLE and risk_history:
192
  prom_risk_gauge.set(risk_history[-1][1])
193
 
194
  # ----------------------------------------------------------------------
195
- # Memory monitoring (daemon thread with graceful stop)
196
  # ----------------------------------------------------------------------
197
  def get_memory_usage():
198
- """Return current process memory usage in MB (RSS)."""
199
  try:
200
  import resource
201
  rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
@@ -216,7 +204,6 @@ def get_memory_usage():
216
  return None
217
 
218
  def memory_monitor_loop():
219
- """Periodically log memory usage. Runs in a daemon thread."""
220
  while not shutdown_event.is_set():
221
  try:
222
  mem_mb = get_memory_usage()
@@ -226,40 +213,30 @@ def memory_monitor_loop():
226
  logger.info("Process memory: unknown")
227
  except Exception as e:
228
  logger.error(f"Memory logging error: {e}")
229
- # Sleep in small intervals to react quickly to shutdown
230
  for _ in range(60):
231
  if shutdown_event.is_set():
232
  break
233
  time.sleep(1)
234
 
235
  # ----------------------------------------------------------------------
236
- # Bayesian Risk Engine (Beta‑Binomial) with sliding window
237
  # ----------------------------------------------------------------------
238
  class BayesianRiskEngine:
239
  def __init__(self, alpha=ALPHA_PRIOR, beta=BETA_PRIOR, maxlen=None):
240
  self.alpha = alpha
241
  self.beta = beta
242
  self.maxlen = maxlen
243
- self.events = deque(maxlen=maxlen) # store (failures, successes)
244
  self.total_failures = 0
245
  self.total_successes = 0
246
 
247
  def update(self, failures, successes):
248
- # Add new event
249
  self.events.append((failures, successes))
250
  self.total_failures += failures
251
  self.total_successes += successes
252
-
253
- # If maxlen is reached and the queue overflows, we've already removed the oldest,
254
- # but we need to subtract it from totals.
255
  if self.maxlen is not None and len(self.events) == self.maxlen:
256
- # The deque automatically discards the leftmost when full, but we have to
257
- # manually adjust totals to reflect the discarded event.
258
- # However, we can't easily know what was discarded. Instead, recompute from deque.
259
  self.total_failures = sum(f for f, _ in self.events)
260
  self.total_successes = sum(s for _, s in self.events)
261
-
262
- # Set alpha,beta = prior + totals
263
  self.alpha = ALPHA_PRIOR + self.total_failures
264
  self.beta = BETA_PRIOR + self.total_successes
265
 
@@ -272,7 +249,7 @@ class BayesianRiskEngine:
272
  return lo, hi
273
 
274
  # ----------------------------------------------------------------------
275
- # Policy Engine (uses global thresholds)
276
  # ----------------------------------------------------------------------
277
  class PolicyEngine:
278
  def __init__(self):
@@ -287,16 +264,15 @@ class PolicyEngine:
287
  return "escalate", f"Risk in escalation zone ({self.thresholds['low']}-{self.thresholds['high']})"
288
 
289
  # ----------------------------------------------------------------------
290
- # Infrastructure analysis (synchronous, with validation and sliding window)
291
  # ----------------------------------------------------------------------
292
  def handle_infra_with_governance(fault_type: str, context_window: int, session_state: dict):
293
  start_time = time.time()
294
  try:
295
- # Input validation
296
  fault_type = fault_type.strip()
297
  if fault_type not in ["none", "switch_down", "server_overload", "cascade"]:
298
  fault_type = "none"
299
- context_window = max(0, min(1000, int(context_window))) # clamp
300
 
301
  fault_map = {
302
  "none": (1, 99),
@@ -306,7 +282,6 @@ def handle_infra_with_governance(fault_type: str, context_window: int, session_s
306
  }
307
  failures, successes = fault_map.get(fault_type, (1, 99))
308
 
309
- # Use context_window: if >0, limit to last N events; else unlimited
310
  maxlen = context_window if context_window > 0 else None
311
  risk_engine = BayesianRiskEngine(maxlen=maxlen)
312
  risk_engine.update(failures, successes)
@@ -340,7 +315,6 @@ def handle_infra_with_governance(fault_type: str, context_window: int, session_s
340
  "control_plane_decision": control_decision
341
  }
342
  }
343
- # Record latency metric
344
  if PROMETHEUS_AVAILABLE:
345
  prom_decision_latency.observe(time.time() - start_time)
346
  return output, session_state
@@ -350,7 +324,6 @@ def handle_infra_with_governance(fault_type: str, context_window: int, session_s
350
 
351
  def autonomous_control_decision(risk, risk_engine, policy_engine):
352
  action, reason = policy_engine.evaluate(risk)
353
- # Use configurable thresholds for risk level
354
  risk_level = "low" if risk < LOW_THRESHOLD else "medium" if risk < HIGH_THRESHOLD else "high"
355
  decision = {
356
  "timestamp": datetime.utcnow().isoformat(),
@@ -363,7 +336,7 @@ def autonomous_control_decision(risk, risk_engine, policy_engine):
363
  return decision
364
 
365
  # ----------------------------------------------------------------------
366
- # MCMC (Metropolis‑Hastings) with input validation and timeout
367
  # ----------------------------------------------------------------------
368
  class MHMCMC:
369
  def __init__(self, log_target, proposal_sd=0.1):
@@ -390,18 +363,16 @@ class MHMCMC:
390
 
391
  def run_hmc_mcmc(samples: int, warmup: int):
392
  try:
393
- # Input validation
394
  samples = max(500, min(10000, int(samples)))
395
  warmup = max(100, min(2000, int(warmup)))
396
  if PROMETHEUS_AVAILABLE:
397
- prom_mcmc_runs.inc() # record metric
398
 
399
- # Generate data: 10 observations with mean 0.5, std 0.2
400
- np.random.seed(42) # for reproducibility
401
  data = np.random.normal(0.5, 0.2, 10)
402
 
403
  def log_prior(mu):
404
- return -0.5 * (mu ** 2) # prior N(0,1)
405
 
406
  def log_likelihood(mu):
407
  return -0.5 * np.sum(((data - mu) / 0.2) ** 2)
@@ -437,7 +408,7 @@ def run_hmc_mcmc(samples: int, warmup: int):
437
  return {"error": str(e)}, go.Figure(), go.Figure()
438
 
439
  # ----------------------------------------------------------------------
440
- # Dashboard plots (thread‑safe with caching)
441
  # ----------------------------------------------------------------------
442
  class TTLCache:
443
  def __init__(self, ttl_seconds=5):
@@ -461,7 +432,7 @@ class TTLCache:
461
  return result
462
  return wrapper
463
 
464
- dashboard_cache = TTLCache(ttl_seconds=2) # cache for 2 seconds
465
 
466
  @dashboard_cache
467
  def generate_risk_gauge():
@@ -509,7 +480,6 @@ def generate_action_timeline():
509
 
510
  @dashboard_cache
511
  def generate_risk_trend():
512
- """Line chart showing risk over time."""
513
  with history_lock:
514
  if not risk_history:
515
  return go.Figure()
@@ -517,7 +487,6 @@ def generate_risk_trend():
517
  risks = [r for _, r in risk_history]
518
  fig = go.Figure()
519
  fig.add_trace(go.Scatter(x=times, y=risks, mode='lines+markers', name='Risk', line=dict(color='red', width=2)))
520
- # Add horizontal lines for thresholds
521
  fig.add_hline(y=LOW_THRESHOLD, line_dash="dash", line_color="green", annotation_text=f"Low ({LOW_THRESHOLD})")
522
  fig.add_hline(y=HIGH_THRESHOLD, line_dash="dash", line_color="orange", annotation_text=f"High ({HIGH_THRESHOLD})")
523
  fig.update_layout(title="Risk Trend", xaxis_title="Time", yaxis_title="Risk Score", yaxis_range=[0, 1])
@@ -547,45 +516,40 @@ def refresh_dashboard():
547
  # Batch simulation
548
  # ----------------------------------------------------------------------
549
  def run_batch_simulation(context_window: int):
550
- """Run evaluation for all fault types and return a summary table and individual results."""
551
  fault_types = ["none", "switch_down", "server_overload", "cascade"]
552
  results = []
553
  for fault in fault_types:
554
- # We'll call handle_infra_with_governance directly
555
- # Since it returns (output, state) and we ignore state
556
  output, _ = handle_infra_with_governance(fault, context_window, {})
557
  if "error" in output:
558
- results.append({
559
- "Fault Type": fault,
560
- "Risk": "Error",
561
- "Decision": output["error"],
562
- "Risk Level": "N/A",
563
- "Confidence Interval": "N/A"
564
- })
565
  else:
566
- results.append({
567
- "Fault Type": fault,
568
- "Risk": f"{output['risk']:.4f}",
569
- "Decision": output["decision"],
570
- "Risk Level": output["governance"]["control_plane_decision"]["risk_level"],
571
- "Confidence Interval": f"[{output['risk_ci'][0]:.3f}, {output['risk_ci'][1]:.3f}]"
572
- })
573
- # Convert to DataFrame-like display
574
- summary_table = results
575
- return summary_table
576
 
577
  # ----------------------------------------------------------------------
578
  # Data export
579
  # ----------------------------------------------------------------------
580
  def export_history_to_csv():
581
- """Generate CSV of all decisions from database."""
582
  try:
583
  with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
584
  cursor = conn.cursor()
585
  cursor.execute("SELECT timestamp, decision_json, risk FROM decisions ORDER BY timestamp")
586
  rows = cursor.fetchall()
587
  if not rows:
588
- return "No data to export", None
589
  output = io.StringIO()
590
  writer = csv.writer(output)
591
  writer.writerow(["Timestamp", "Decision", "Risk", "Approved", "Risk Level", "Reason"])
@@ -600,27 +564,29 @@ def export_history_to_csv():
600
  dec.get("reason", "")
601
  ])
602
  output.seek(0)
603
- return output.getvalue()
 
 
 
604
  except Exception as e:
605
  logger.error(f"Export failed: {e}")
606
- return f"Export failed: {str(e)}", None
607
 
608
  # ----------------------------------------------------------------------
609
- # Update thresholds (global)
610
  # ----------------------------------------------------------------------
611
  def update_thresholds(low: float, high: float):
612
  global LOW_THRESHOLD, HIGH_THRESHOLD
613
  if 0 <= low < high <= 1:
614
  LOW_THRESHOLD = low
615
  HIGH_THRESHOLD = high
616
- # Also update PolicyEngine thresholds (but PolicyEngine reads from globals, so fine)
617
  logger.info(f"Updated thresholds: low={low}, high={high}")
618
  return f"Thresholds updated: approve < {low}, escalate {low}-{high}, deny > {high}"
619
  else:
620
  return f"Invalid thresholds: low={low}, high={high}. Must satisfy 0 ≤ low < high ≤ 1."
621
 
622
  # ----------------------------------------------------------------------
623
- # OSS capabilities (mocked)
624
  # ----------------------------------------------------------------------
625
  oss_caps = {
626
  "edition": "OSS (Demo)",
@@ -683,10 +649,8 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
683
  All components are implemented with only `numpy`, `scipy`, and standard libraries.
684
  """)
685
 
686
- # ------------------------------------------------------------------
687
- # Control Plane Dashboard with auto-refresh
688
- # ------------------------------------------------------------------
689
  with gr.Tabs():
 
690
  with gr.TabItem("Control Plane Dashboard"):
691
  gr.Markdown("### 🎮 Control Plane")
692
  with gr.Row():
@@ -715,24 +679,20 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
715
  with gr.Row():
716
  auto_refresh = gr.Checkbox(label="Auto-refresh (3s)", value=False)
717
  refresh_btn = gr.Button("Refresh Now")
718
- # Auto-refresh timer
719
- timer = gr.Timer(value=3, active=False) # will be toggled
720
  def refresh_if_enabled(auto):
721
  if auto:
722
  return refresh_dashboard()
723
  else:
724
- return [gr.update() for _ in range(5)] # no update
725
  timer.tick(refresh_if_enabled, inputs=[auto_refresh], outputs=[control_stats, risk_gauge, decision_pie, action_timeline, risk_trend])
726
  refresh_btn.click(
727
  fn=refresh_dashboard,
728
  outputs=[control_stats, risk_gauge, decision_pie, action_timeline, risk_trend]
729
  )
730
- # Start/stop timer based on checkbox
731
  auto_refresh.change(lambda v: gr.Timer(active=v), inputs=[auto_refresh], outputs=[timer])
732
 
733
- # ------------------------------------------------------------------
734
- # Infrastructure Reliability (with batch simulation)
735
- # ------------------------------------------------------------------
736
  with gr.TabItem("Infrastructure Reliability"):
737
  gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Bayesian Risk")
738
  infra_state = gr.State(value={})
@@ -759,7 +719,8 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
759
  infra_output = gr.JSON(label="Analysis Result")
760
  batch_results = gr.Dataframe(
761
  headers=["Fault Type", "Risk", "Decision", "Risk Level", "Confidence Interval"],
762
- label="Batch Simulation Results"
 
763
  )
764
  infra_btn.click(
765
  fn=handle_infra_with_governance,
@@ -772,9 +733,7 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
772
  outputs=[batch_results]
773
  )
774
 
775
- # ------------------------------------------------------------------
776
  # Deep Analysis (MCMC)
777
- # ------------------------------------------------------------------
778
  with gr.TabItem("Deep Analysis (MCMC)"):
779
  gr.Markdown("### Markov Chain Monte Carlo (Metropolis‑Hastings)")
780
  with gr.Row():
@@ -793,9 +752,7 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
793
  outputs=[hmc_summary, hmc_trace_plot, hmc_pair_plot]
794
  )
795
 
796
- # ------------------------------------------------------------------
797
- # Policy Management (with interactive sliders)
798
- # ------------------------------------------------------------------
799
  with gr.TabItem("Policy Management"):
800
  gr.Markdown("### 📋 Execution Policies")
801
  with gr.Row():
@@ -824,9 +781,7 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
824
  outputs=[policy_display]
825
  )
826
 
827
- # ------------------------------------------------------------------
828
- # Enterprise / OSS (with data export)
829
- # ------------------------------------------------------------------
830
  with gr.TabItem("Enterprise / OSS"):
831
  gr.Markdown(f"""
832
  <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 2rem; border-radius: 12px; margin-bottom: 2rem; text-align: center; color: white;">
@@ -875,35 +830,13 @@ with gr.Blocks(title=f"ARF v{VERSION} – Bayesian Risk Scoring Demo", theme=gr.
875
  <a href="mailto:petter2025us@outlook.com" style="background: #667eea; color: white; padding: 12px 24px; text-decoration: none; border-radius: 8px; font-weight: bold;">📧 Contact Sales</a>
876
  </div>
877
  """)
878
- # Data export section
879
  gr.Markdown("### 📥 Export Decision History")
880
- with gr.Row():
881
- export_btn = gr.DownloadButton("Download CSV", variant="primary")
882
- export_btn.click(
883
- fn=export_history_to_csv,
884
- outputs=[gr.File(label="decision_history.csv", visible=False)] # hidden, but we need a file component
885
- )
886
- # Note: gradio DownloadButton works with a function that returns a file path or bytes.
887
- # We'll create a temporary file.
888
- def export_and_return_file():
889
- csv_data = export_history_to_csv()
890
- if csv_data and not csv_data.startswith("Export failed"):
891
- # Write to a temporary file
892
- import tempfile
893
- with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
894
- f.write(csv_data)
895
- return f.name
896
- else:
897
- return None
898
  export_btn.click(
899
- fn=export_and_return_file,
900
  outputs=[gr.File(label="decision_history.csv")]
901
  )
902
 
903
- # ------------------------------------------------------------------
904
- # Wire events for infra and MCMC (already done above)
905
- # ------------------------------------------------------------------
906
-
907
  # ----------------------------------------------------------------------
908
  # Launch
909
  # ----------------------------------------------------------------------
 
15
  import functools
16
  import csv
17
  import io
18
+ import tempfile
19
  from collections import deque
20
  from scipy.stats import beta
21
  import plotly.graph_objects as go
 
60
  logger = logging.getLogger(__name__)
61
  logger.setLevel(getattr(logging, LOG_LEVEL, logging.INFO))
62
 
 
63
  file_handler = logging.handlers.RotatingFileHandler(
64
  "/var/log/arf/app.log", maxBytes=10_485_760, backupCount=5
65
  )
66
  file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
67
 
 
68
  console_handler = logging.StreamHandler(sys.stdout)
69
  console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
70
 
71
  logger.addHandler(file_handler)
72
  logger.addHandler(console_handler)
73
+ logger.propagate = False
74
 
75
  # ----------------------------------------------------------------------
76
  # SQLite persistence with secure permissions
77
  # ----------------------------------------------------------------------
78
  def init_db():
 
79
  db_dir = os.path.dirname(DB_PATH)
80
  if db_dir and not os.path.exists(db_dir):
81
  os.makedirs(db_dir, exist_ok=True)
 
90
  )
91
  ''')
92
  conn.commit()
 
93
  try:
94
  os.chmod(DB_PATH, 0o600)
95
  except Exception as e:
 
97
  logger.info(f"Database initialized at {DB_PATH}")
98
 
99
  def save_decision_to_db(decision: dict, risk: float):
 
100
  try:
101
  with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
102
  cursor = conn.cursor()
 
109
  logger.error(f"Failed to save decision to DB: {e}")
110
 
111
  def load_recent_decisions(limit: int = 100) -> List[Tuple[str, dict, float]]:
 
112
  decisions = []
113
  try:
114
  with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
 
120
  rows = cursor.fetchall()
121
  for ts, json_str, risk in rows:
122
  decisions.append((ts, json.loads(json_str), risk))
123
+ decisions.reverse()
124
  except Exception as e:
125
  logger.error(f"Failed to load decisions from DB: {e}")
126
  return decisions
127
 
128
  def vacuum_db():
 
129
  try:
130
  with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
131
  conn.execute("VACUUM")
 
148
  prom_mcmc_runs = None
149
 
150
  # ----------------------------------------------------------------------
151
+ # Thread‑safe history
152
  # ----------------------------------------------------------------------
153
  decision_history = []
154
  risk_history = []
 
156
  shutdown_event = threading.Event()
157
 
158
  def update_dashboard_data(decision: dict, risk: float):
 
159
  with history_lock:
160
  decision_history.append((datetime.utcnow().isoformat(), decision, risk))
161
  risk_history.append((datetime.utcnow().isoformat(), risk))
 
162
  if len(decision_history) > 100:
163
  decision_history.pop(0)
164
  if len(risk_history) > 100:
165
  risk_history.pop(0)
166
  save_decision_to_db(decision, risk)
 
167
  if PROMETHEUS_AVAILABLE:
168
  prom_decisions_total.labels(action=decision.get("risk_level", "unknown")).inc()
169
  prom_risk_gauge.set(risk)
170
 
171
  def refresh_history_from_db():
 
172
  global decision_history, risk_history
173
  decisions = load_recent_decisions(100)
174
  with history_lock:
 
177
  for ts, dec, risk in decisions:
178
  decision_history.append((ts, dec, risk))
179
  risk_history.append((ts, risk))
 
180
  if PROMETHEUS_AVAILABLE and risk_history:
181
  prom_risk_gauge.set(risk_history[-1][1])
182
 
183
  # ----------------------------------------------------------------------
184
+ # Memory monitoring
185
  # ----------------------------------------------------------------------
186
  def get_memory_usage():
 
187
  try:
188
  import resource
189
  rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
 
204
  return None
205
 
206
  def memory_monitor_loop():
 
207
  while not shutdown_event.is_set():
208
  try:
209
  mem_mb = get_memory_usage()
 
213
  logger.info("Process memory: unknown")
214
  except Exception as e:
215
  logger.error(f"Memory logging error: {e}")
 
216
  for _ in range(60):
217
  if shutdown_event.is_set():
218
  break
219
  time.sleep(1)
220
 
221
  # ----------------------------------------------------------------------
222
+ # Bayesian Risk Engine
223
  # ----------------------------------------------------------------------
224
  class BayesianRiskEngine:
225
  def __init__(self, alpha=ALPHA_PRIOR, beta=BETA_PRIOR, maxlen=None):
226
  self.alpha = alpha
227
  self.beta = beta
228
  self.maxlen = maxlen
229
+ self.events = deque(maxlen=maxlen)
230
  self.total_failures = 0
231
  self.total_successes = 0
232
 
233
  def update(self, failures, successes):
 
234
  self.events.append((failures, successes))
235
  self.total_failures += failures
236
  self.total_successes += successes
 
 
 
237
  if self.maxlen is not None and len(self.events) == self.maxlen:
 
 
 
238
  self.total_failures = sum(f for f, _ in self.events)
239
  self.total_successes = sum(s for _, s in self.events)
 
 
240
  self.alpha = ALPHA_PRIOR + self.total_failures
241
  self.beta = BETA_PRIOR + self.total_successes
242
 
 
249
  return lo, hi
250
 
251
  # ----------------------------------------------------------------------
252
+ # Policy Engine
253
  # ----------------------------------------------------------------------
254
  class PolicyEngine:
255
  def __init__(self):
 
264
  return "escalate", f"Risk in escalation zone ({self.thresholds['low']}-{self.thresholds['high']})"
265
 
266
  # ----------------------------------------------------------------------
267
+ # Infrastructure analysis
268
  # ----------------------------------------------------------------------
269
  def handle_infra_with_governance(fault_type: str, context_window: int, session_state: dict):
270
  start_time = time.time()
271
  try:
 
272
  fault_type = fault_type.strip()
273
  if fault_type not in ["none", "switch_down", "server_overload", "cascade"]:
274
  fault_type = "none"
275
+ context_window = max(0, min(1000, int(context_window)))
276
 
277
  fault_map = {
278
  "none": (1, 99),
 
282
  }
283
  failures, successes = fault_map.get(fault_type, (1, 99))
284
 
 
285
  maxlen = context_window if context_window > 0 else None
286
  risk_engine = BayesianRiskEngine(maxlen=maxlen)
287
  risk_engine.update(failures, successes)
 
315
  "control_plane_decision": control_decision
316
  }
317
  }
 
318
  if PROMETHEUS_AVAILABLE:
319
  prom_decision_latency.observe(time.time() - start_time)
320
  return output, session_state
 
324
 
325
  def autonomous_control_decision(risk, risk_engine, policy_engine):
326
  action, reason = policy_engine.evaluate(risk)
 
327
  risk_level = "low" if risk < LOW_THRESHOLD else "medium" if risk < HIGH_THRESHOLD else "high"
328
  decision = {
329
  "timestamp": datetime.utcnow().isoformat(),
 
336
  return decision
337
 
338
  # ----------------------------------------------------------------------
339
+ # MCMC
340
  # ----------------------------------------------------------------------
341
  class MHMCMC:
342
  def __init__(self, log_target, proposal_sd=0.1):
 
363
 
364
  def run_hmc_mcmc(samples: int, warmup: int):
365
  try:
 
366
  samples = max(500, min(10000, int(samples)))
367
  warmup = max(100, min(2000, int(warmup)))
368
  if PROMETHEUS_AVAILABLE:
369
+ prom_mcmc_runs.inc()
370
 
371
+ np.random.seed(42)
 
372
  data = np.random.normal(0.5, 0.2, 10)
373
 
374
  def log_prior(mu):
375
+ return -0.5 * (mu ** 2)
376
 
377
  def log_likelihood(mu):
378
  return -0.5 * np.sum(((data - mu) / 0.2) ** 2)
 
408
  return {"error": str(e)}, go.Figure(), go.Figure()
409
 
410
  # ----------------------------------------------------------------------
411
+ # Dashboard plots
412
  # ----------------------------------------------------------------------
413
  class TTLCache:
414
  def __init__(self, ttl_seconds=5):
 
432
  return result
433
  return wrapper
434
 
435
+ dashboard_cache = TTLCache(ttl_seconds=2)
436
 
437
  @dashboard_cache
438
  def generate_risk_gauge():
 
480
 
481
  @dashboard_cache
482
  def generate_risk_trend():
 
483
  with history_lock:
484
  if not risk_history:
485
  return go.Figure()
 
487
  risks = [r for _, r in risk_history]
488
  fig = go.Figure()
489
  fig.add_trace(go.Scatter(x=times, y=risks, mode='lines+markers', name='Risk', line=dict(color='red', width=2)))
 
490
  fig.add_hline(y=LOW_THRESHOLD, line_dash="dash", line_color="green", annotation_text=f"Low ({LOW_THRESHOLD})")
491
  fig.add_hline(y=HIGH_THRESHOLD, line_dash="dash", line_color="orange", annotation_text=f"High ({HIGH_THRESHOLD})")
492
  fig.update_layout(title="Risk Trend", xaxis_title="Time", yaxis_title="Risk Score", yaxis_range=[0, 1])
 
516
  # Batch simulation
517
  # ----------------------------------------------------------------------
518
  def run_batch_simulation(context_window: int):
 
519
  fault_types = ["none", "switch_down", "server_overload", "cascade"]
520
  results = []
521
  for fault in fault_types:
 
 
522
  output, _ = handle_infra_with_governance(fault, context_window, {})
523
  if "error" in output:
524
+ results.append([
525
+ fault,
526
+ "Error",
527
+ output["error"],
528
+ "N/A",
529
+ "N/A"
530
+ ])
531
  else:
532
+ results.append([
533
+ fault,
534
+ f"{output['risk']:.4f}",
535
+ output["decision"],
536
+ output["governance"]["control_plane_decision"]["risk_level"],
537
+ f"[{output['risk_ci'][0]:.3f}, {output['risk_ci'][1]:.3f}]"
538
+ ])
539
+ # Return as list of lists for gr.Dataframe
540
+ return results
 
541
 
542
  # ----------------------------------------------------------------------
543
  # Data export
544
  # ----------------------------------------------------------------------
545
  def export_history_to_csv():
 
546
  try:
547
  with contextlib.closing(sqlite3.connect(DB_PATH)) as conn:
548
  cursor = conn.cursor()
549
  cursor.execute("SELECT timestamp, decision_json, risk FROM decisions ORDER BY timestamp")
550
  rows = cursor.fetchall()
551
  if not rows:
552
+ return None
553
  output = io.StringIO()
554
  writer = csv.writer(output)
555
  writer.writerow(["Timestamp", "Decision", "Risk", "Approved", "Risk Level", "Reason"])
 
564
  dec.get("reason", "")
565
  ])
566
  output.seek(0)
567
+ # Write to a temporary file
568
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
569
+ f.write(output.getvalue())
570
+ return f.name
571
  except Exception as e:
572
  logger.error(f"Export failed: {e}")
573
+ return None
574
 
575
  # ----------------------------------------------------------------------
576
+ # Update thresholds
577
  # ----------------------------------------------------------------------
578
  def update_thresholds(low: float, high: float):
579
  global LOW_THRESHOLD, HIGH_THRESHOLD
580
  if 0 <= low < high <= 1:
581
  LOW_THRESHOLD = low
582
  HIGH_THRESHOLD = high
 
583
  logger.info(f"Updated thresholds: low={low}, high={high}")
584
  return f"Thresholds updated: approve < {low}, escalate {low}-{high}, deny > {high}"
585
  else:
586
  return f"Invalid thresholds: low={low}, high={high}. Must satisfy 0 ≤ low < high ≤ 1."
587
 
588
  # ----------------------------------------------------------------------
589
+ # OSS capabilities
590
  # ----------------------------------------------------------------------
591
  oss_caps = {
592
  "edition": "OSS (Demo)",
 
649
  All components are implemented with only `numpy`, `scipy`, and standard libraries.
650
  """)
651
 
 
 
 
652
  with gr.Tabs():
653
+ # Control Plane Dashboard
654
  with gr.TabItem("Control Plane Dashboard"):
655
  gr.Markdown("### 🎮 Control Plane")
656
  with gr.Row():
 
679
  with gr.Row():
680
  auto_refresh = gr.Checkbox(label="Auto-refresh (3s)", value=False)
681
  refresh_btn = gr.Button("Refresh Now")
682
+ timer = gr.Timer(value=3, active=False)
 
683
  def refresh_if_enabled(auto):
684
  if auto:
685
  return refresh_dashboard()
686
  else:
687
+ return [gr.update() for _ in range(5)]
688
  timer.tick(refresh_if_enabled, inputs=[auto_refresh], outputs=[control_stats, risk_gauge, decision_pie, action_timeline, risk_trend])
689
  refresh_btn.click(
690
  fn=refresh_dashboard,
691
  outputs=[control_stats, risk_gauge, decision_pie, action_timeline, risk_trend]
692
  )
 
693
  auto_refresh.change(lambda v: gr.Timer(active=v), inputs=[auto_refresh], outputs=[timer])
694
 
695
+ # Infrastructure Reliability
 
 
696
  with gr.TabItem("Infrastructure Reliability"):
697
  gr.Markdown("### 🏗️ Infrastructure Intent Evaluation with Bayesian Risk")
698
  infra_state = gr.State(value={})
 
719
  infra_output = gr.JSON(label="Analysis Result")
720
  batch_results = gr.Dataframe(
721
  headers=["Fault Type", "Risk", "Decision", "Risk Level", "Confidence Interval"],
722
+ label="Batch Simulation Results",
723
+ datatype=["str", "str", "str", "str", "str"]
724
  )
725
  infra_btn.click(
726
  fn=handle_infra_with_governance,
 
733
  outputs=[batch_results]
734
  )
735
 
 
736
  # Deep Analysis (MCMC)
 
737
  with gr.TabItem("Deep Analysis (MCMC)"):
738
  gr.Markdown("### Markov Chain Monte Carlo (Metropolis‑Hastings)")
739
  with gr.Row():
 
752
  outputs=[hmc_summary, hmc_trace_plot, hmc_pair_plot]
753
  )
754
 
755
+ # Policy Management
 
 
756
  with gr.TabItem("Policy Management"):
757
  gr.Markdown("### 📋 Execution Policies")
758
  with gr.Row():
 
781
  outputs=[policy_display]
782
  )
783
 
784
+ # Enterprise / OSS
 
 
785
  with gr.TabItem("Enterprise / OSS"):
786
  gr.Markdown(f"""
787
  <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 2rem; border-radius: 12px; margin-bottom: 2rem; text-align: center; color: white;">
 
830
  <a href="mailto:petter2025us@outlook.com" style="background: #667eea; color: white; padding: 12px 24px; text-decoration: none; border-radius: 8px; font-weight: bold;">📧 Contact Sales</a>
831
  </div>
832
  """)
 
833
  gr.Markdown("### 📥 Export Decision History")
834
+ export_btn = gr.DownloadButton("Download CSV", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
835
  export_btn.click(
836
+ fn=export_history_to_csv,
837
  outputs=[gr.File(label="decision_history.csv")]
838
  )
839
 
 
 
 
 
840
  # ----------------------------------------------------------------------
841
  # Launch
842
  # ----------------------------------------------------------------------