petter2025 committed on
Commit
42fb0d1
·
verified ·
1 Parent(s): 1734dc9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -85
app.py CHANGED
@@ -1,96 +1,80 @@
1
- import gradio as gr
2
- import sqlite3
3
  import time
4
- from datetime import datetime
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- DB_PATH = "reliability.db"
 
 
 
 
 
 
 
7
 
8
- # --- Setup database (first run only) ---
9
def init_db():
    """Create the ``telemetry`` and ``alerts`` tables if they are missing.

    Both statements use ``CREATE TABLE IF NOT EXISTS``, so calling this on
    every start-up is harmless. The connection is closed even if one of the
    statements raises, so a failed set-up does not leak a database handle.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS telemetry (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                timestamp TEXT,
                component TEXT,
                latency REAL,
                error_rate REAL
            )
        """)
        conn.execute("""
            CREATE TABLE IF NOT EXISTS alerts (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                event_id INTEGER,
                alert_type TEXT,
                threshold REAL,
                timestamp TEXT
            )
        """)
        conn.commit()
    finally:
        conn.close()


# Make sure the schema exists before any handler touches the database.
init_db()
 
 
 
 
34
 
35
- # --- Core functions ---
36
def log_event(component, latency, error_rate):
    """Store one telemetry event and report whether it looks anomalous.

    Args:
        component: Name of the component that produced the event.
        latency: Observed latency value for the event.
        error_rate: Observed error rate for the event.

    Returns:
        The message produced by ``detect_anomaly()``.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute(
            "INSERT INTO telemetry (timestamp, component, latency, error_rate) VALUES (?, ?, ?, ?)",
            (datetime.now().isoformat(), component, latency, error_rate),
        )
        conn.commit()
    finally:
        # Close on the failure path too, so a bad insert does not leak the handle.
        conn.close()
    return detect_anomaly()
44
 
45
def detect_anomaly(threshold_latency=200, threshold_error=0.3):
    """Check the most recent telemetry row against simple thresholds.

    Args:
        threshold_latency: Latency above this value counts as an anomaly.
        threshold_error: Error rate above this value counts as an anomaly.

    Returns:
        A warning message when the newest row exceeds either threshold
        (an alert row is saved as a side effect), otherwise the
        "no anomaly" message. The latter is also returned when the
        telemetry table is empty.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        # Name the columns explicitly so a schema change cannot silently
        # shift the positional unpacking below.
        row = conn.execute(
            "SELECT id, component, latency, error_rate "
            "FROM telemetry ORDER BY id DESC LIMIT 1"
        ).fetchone()
    finally:
        conn.close()

    if row is None:
        return "✅ No anomaly detected."

    # ``event_id`` rather than ``id`` to avoid shadowing the builtin.
    event_id, component, latency, error_rate = row
    if latency > threshold_latency or error_rate > threshold_error:
        alert_msg = f"⚠️ Anomaly detected in {component} — latency {latency}ms, error rate {error_rate}"
        # NOTE(review): the stored "threshold" is max(latency, error_rate),
        # i.e. an observed value mixing two units; kept as-is for compatibility.
        save_alert(event_id, "anomaly", max(latency, error_rate))
        return alert_msg
    return "✅ No anomaly detected."
58
 
59
def save_alert(event_id, alert_type, threshold):
    """Insert one row into the ``alerts`` table, stamped with the current time.

    Args:
        event_id: Id of the telemetry row the alert refers to.
        alert_type: Short label for the alert (for example ``"anomaly"``).
        threshold: Numeric value recorded in the ``threshold`` column.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute(
            "INSERT INTO alerts (event_id, alert_type, threshold, timestamp) VALUES (?, ?, ?, ?)",
            (event_id, alert_type, threshold, datetime.now().isoformat()),
        )
        conn.commit()
    finally:
        # Always release the handle, even when the insert fails.
        conn.close()
66
 
67
def show_recent_alerts():
    """Return the ten newest alerts as text, one alert per line.

    Returns:
        ``"No alerts yet."`` when the table is empty, otherwise lines of the
        form ``[timestamp] alert_type (threshold: value)``, newest first.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        # Select only the columns that are rendered, by name, instead of
        # indexing into ``SELECT *`` with magic positions.
        rows = conn.execute(
            "SELECT timestamp, alert_type, threshold "
            "FROM alerts ORDER BY id DESC LIMIT 10"
        ).fetchall()
    finally:
        conn.close()

    if not rows:
        return "No alerts yet."
    return "\n".join(
        f"[{timestamp}] {alert_type} (threshold: {threshold})"
        for timestamp, alert_type, threshold in rows
    )
76
 
77
- # --- Gradio UI ---
78
# NOTE(review): the nesting under gr.Row() is reconstructed; confirm that only
# the three input fields belong inside the row.
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Agentic Reliability Framework MVP")
    gr.Markdown("Simulate telemetry events and detect anomalies automatically.")

    # Inputs describing a single telemetry event.
    with gr.Row():
        component = gr.Textbox(value="api-service", label="Component")
        latency = gr.Number(value=150, label="Latency (ms)")
        error_rate = gr.Number(value=0.05, label="Error rate")
    btn = gr.Button("Submit Event")
    output = gr.Textbox(label="Detection Output")

    # Submitting stores the event and shows the detection verdict.
    btn.click(
        fn=log_event,
        inputs=[component, latency, error_rate],
        outputs=output,
    )

    # Read-only view of the latest alerts, refreshed on demand.
    gr.Markdown("### Recent Alerts")
    alert_box = gr.Textbox(label="", interactive=False)
    refresh_btn = gr.Button("Refresh Alerts")
    refresh_btn.click(fn=show_recent_alerts, outputs=alert_box)

demo.launch()
 
1
+ import os
2
+ import random
3
  import time
4
+ import gradio as gr
5
+ import pandas as pd
6
+ from huggingface_hub import InferenceClient
7
+
8
+ # === Initialize Hugging Face client ===
9
+ HF_TOKEN = os.getenv("HF_API_TOKEN")
10
+ client = InferenceClient(token=HF_TOKEN)
11
+
12
+ # === Mock telemetry state ===
13
+ events_log = []
14
+
15
def simulate_event():
    """Simulate one telemetry datapoint.

    Returns:
        A dict with the keys ``timestamp`` (local time formatted as
        ``%Y-%m-%d %H:%M:%S``), ``component`` (one of four fixed service
        names), ``latency`` (non-negative, rounded to 2 decimals) and
        ``error_rate`` (in ``[0, 0.2]``, rounded to 3 decimals).
    """
    component = random.choice(["api-service", "data-ingestor", "model-runner", "queue-worker"])
    # A normal distribution with mean 150 and sigma 60 dips below zero now and
    # then; a negative latency is meaningless, so clamp the sample at zero.
    latency = max(0.0, round(random.gauss(150, 60), 2))
    error_rate = round(random.random() * 0.2, 3)
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    return {
        "timestamp": timestamp,
        "component": component,
        "latency": latency,
        "error_rate": error_rate,
    }
22
+
23
def detect_anomaly(event, latency_threshold=250, error_rate_threshold=0.1):
    """Basic anomaly detection: threshold rule.

    Args:
        event: Mapping with numeric ``"latency"`` and ``"error_rate"`` entries.
        latency_threshold: Latency strictly above this value is anomalous.
        error_rate_threshold: Error rate strictly above this value is anomalous.

    Returns:
        ``True`` when either metric exceeds its threshold, else ``False``.
    """
    return (
        event["latency"] > latency_threshold
        or event["error_rate"] > error_rate_threshold
    )
28
 
29
def analyze_cause(event):
    """Use an LLM to interpret and explain an anomalous event.

    Args:
        event: Mapping with ``component``, ``latency``, ``error_rate`` and
            ``timestamp`` entries.

    Returns:
        The model's explanation with surrounding whitespace stripped, or an
        ``"Error generating analysis: ..."`` message when the call fails.
    """
    # Build the prompt from explicit lines so its text does not depend on how
    # this function happens to be indented in the source file.
    prompt = "\n".join([
        "You are an AI reliability engineer analyzing telemetry.",
        f"Component: {event['component']}",
        f"Latency: {event['latency']}ms",
        f"Error Rate: {event['error_rate']}",
        f"Timestamp: {event['timestamp']}",
        "",
        "Explain in plain English the likely root cause of this anomaly "
        "and one safe auto-healing action to take.",
    ])
    try:
        response = client.text_generation(
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",
            prompt=prompt,
            max_new_tokens=180,
        )
        return response.strip()
    except Exception as e:
        # Deliberately broad: this runs inside a UI callback, and a failed
        # inference call should show up as text rather than crash the handler.
        return f"Error generating analysis: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
def process_event():
    """Simulate event → detect → diagnose → log.

    Returns:
        A ``(message, frame)`` tuple: a one-line status message and a pandas
        DataFrame holding up to the 15 most recently logged events.
    """
    event = simulate_event()

    # Add ``status`` before ``analysis`` so the DataFrame columns come out in
    # the same order as the headers declared on the results table.
    if detect_anomaly(event):
        event["status"] = "Anomaly"
        event["analysis"] = analyze_cause(event)
    else:
        event["status"] = "Normal"
        event["analysis"] = "-"

    events_log.append(event)
    # Only the newest 15 entries are shown, so convert just those instead of
    # the whole, ever-growing log.
    df = pd.DataFrame(events_log[-15:])
    return f"✅ Event Processed ({event['status']})", df
 
 
 
 
 
 
 
 
 
 
68
 
69
+ # === Gradio UI ===
70
with gr.Blocks(title="🧠 Agentic Reliability Framework MVP") as demo:
    # Page heading and tagline.
    gr.Markdown(
        "# 🧠 Agentic Reliability Framework MVP\n"
        "### Real-time anomaly detection + AI-driven diagnostics"
    )

    # One button drives the whole pipeline; the outputs are a status line and
    # a table of recent events.
    run_btn = gr.Button("🚀 Submit Telemetry Event")
    status = gr.Textbox(label="Detection Output")
    alerts = gr.Dataframe(
        headers=[
            "timestamp",
            "component",
            "latency",
            "error_rate",
            "status",
            "analysis",
        ],
        label="Recent Events (Last 15)",
        wrap=True,
    )

    run_btn.click(
        fn=process_event,
        inputs=None,
        outputs=[status, alerts],
    )

demo.launch()