petter2025 committed on
Commit
8974b1e
Β·
verified Β·
1 Parent(s): 42fb0d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -15
app.py CHANGED
@@ -4,6 +4,7 @@ import time
4
  import gradio as gr
5
  import pandas as pd
6
  from huggingface_hub import InferenceClient
 
7
 
8
  # === Initialize Hugging Face client ===
9
  HF_TOKEN = os.getenv("HF_API_TOKEN")
@@ -11,23 +12,48 @@ client = InferenceClient(token=HF_TOKEN)
11
 
12
  # === Mock telemetry state ===
13
  events_log = []
 
14
 
15
- def simulate_event():
 
 
 
 
 
 
16
  """Simulate one telemetry datapoint."""
17
  component = random.choice(["api-service", "data-ingestor", "model-runner", "queue-worker"])
18
- latency = round(random.gauss(150, 60), 2)
19
- error_rate = round(random.random() * 0.2, 3)
 
 
 
 
20
  timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
21
  return {"timestamp": timestamp, "component": component, "latency": latency, "error_rate": error_rate}
22
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def detect_anomaly(event):
24
- """Basic anomaly detection: threshold rule."""
25
- if event["latency"] > 250 or event["error_rate"] > 0.1:
 
26
  return True
27
  return False
28
 
 
29
  def analyze_cause(event):
30
- """Use an LLM to interpret and explain anomalies."""
31
  prompt = f"""
32
  You are an AI reliability engineer analyzing telemetry.
33
  Component: {event['component']}
@@ -35,7 +61,10 @@ def analyze_cause(event):
35
  Error Rate: {event['error_rate']}
36
  Timestamp: {event['timestamp']}
37
 
38
- Explain in plain English the likely root cause of this anomaly and one safe auto-healing action to take.
 
 
 
39
  """
40
  try:
41
  response = client.text_generation(
@@ -47,33 +76,63 @@ def analyze_cause(event):
47
  except Exception as e:
48
  return f"Error generating analysis: {e}"
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def process_event():
51
- """Simulate event β†’ detect β†’ diagnose β†’ log."""
52
- event = simulate_event()
 
 
 
 
 
 
53
  is_anomaly = detect_anomaly(event)
54
- result = {"event": event, "anomaly": is_anomaly, "analysis": None}
55
 
56
  if is_anomaly:
57
  analysis = analyze_cause(event)
58
- result["analysis"] = analysis
59
  event["analysis"] = analysis
60
  event["status"] = "Anomaly"
 
 
 
 
 
 
 
 
 
61
  else:
62
  event["analysis"] = "-"
63
  event["status"] = "Normal"
 
64
 
65
  events_log.append(event)
66
- df = pd.DataFrame(events_log).tail(15)
67
  return f"βœ… Event Processed ({event['status']})", df
68
 
 
69
  # === Gradio UI ===
70
  with gr.Blocks(title="🧠 Agentic Reliability Framework MVP") as demo:
71
- gr.Markdown("# 🧠 Agentic Reliability Framework MVP\n### Real-time anomaly detection + AI-driven diagnostics")
72
 
73
  run_btn = gr.Button("πŸš€ Submit Telemetry Event")
74
  status = gr.Textbox(label="Detection Output")
75
- alerts = gr.Dataframe(headers=["timestamp", "component", "latency", "error_rate", "status", "analysis"],
76
- label="Recent Events (Last 15)", wrap=True)
77
 
78
  run_btn.click(fn=process_event, inputs=None, outputs=[status, alerts])
79
 
 
4
  import gradio as gr
5
  import pandas as pd
6
  from huggingface_hub import InferenceClient
7
+ from statistics import mean
8
 
9
  # === Initialize Hugging Face client ===
10
  HF_TOKEN = os.getenv("HF_API_TOKEN")
 
12
 
13
  # === Mock telemetry state ===
14
  events_log = []
15
+ anomaly_counter = 0
16
 
17
+ # === Configurable parameters ===
18
+ ROLLING_WINDOW = 30
19
+ LATENCY_BASE_THRESHOLD = 150
20
+ ERROR_BASE_THRESHOLD = 0.05
21
+
22
+
23
def simulate_event(force_anomaly=False):
    """Simulate one telemetry datapoint.

    Args:
        force_anomaly: When True, draw latency and error_rate from ranges
            guaranteed to exceed the detection thresholds (latency 260-400,
            error_rate 0.12-0.25) so the demo reliably produces anomalies.

    Returns:
        dict with keys "timestamp", "component", "latency", "error_rate".
    """
    component = random.choice(["api-service", "data-ingestor", "model-runner", "queue-worker"])
    if force_anomaly:
        latency = round(random.uniform(260, 400), 2)
        error_rate = round(random.uniform(0.12, 0.25), 3)
    else:
        # random.gauss(150, 60) occasionally yields negative samples; clamp
        # to zero so the simulated latency is always physically plausible.
        latency = round(max(0.0, random.gauss(150, 60)), 2)
        error_rate = round(random.random() * 0.2, 3)
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    return {"timestamp": timestamp, "component": component, "latency": latency, "error_rate": error_rate}
34
 
35
+
36
def adaptive_thresholds():
    """Compute dynamic latency/error thresholds from rolling averages.

    Uses the static base thresholds until at least ROLLING_WINDOW events
    have been logged; afterwards the thresholds sit 25% / 50% above the
    rolling means of the most recent window.
    """
    if len(events_log) < ROLLING_WINDOW:
        return LATENCY_BASE_THRESHOLD, ERROR_BASE_THRESHOLD
    window = events_log[-ROLLING_WINDOW:]
    avg_latency = mean(e["latency"] for e in window)
    avg_error = mean(e["error_rate"] for e in window)
    return avg_latency * 1.25, avg_error * 1.5
45
+
46
+
47
def detect_anomaly(event):
    """Adaptive anomaly detection: flag the event when either metric
    exceeds its current (rolling-average-based) threshold."""
    latency_limit, error_limit = adaptive_thresholds()
    exceeded = event["latency"] > latency_limit or event["error_rate"] > error_limit
    return exceeded
53
 
54
+
55
  def analyze_cause(event):
56
+ """Use LLM to interpret and explain anomalies."""
57
  prompt = f"""
58
  You are an AI reliability engineer analyzing telemetry.
59
  Component: {event['component']}
 
61
  Error Rate: {event['error_rate']}
62
  Timestamp: {event['timestamp']}
63
 
64
+ Explain the likely root cause and one safe auto-healing action.
65
+ Output in this format:
66
+ Cause: <short cause summary>
67
+ Action: <short repair suggestion>
68
  """
69
  try:
70
  response = client.text_generation(
 
76
  except Exception as e:
77
  return f"Error generating analysis: {e}"
78
 
79
+
80
def simulate_healing(action_text):
    """Mock execution of a self-healing action.

    Scans the suggested action text for known remediation keywords and
    returns a canned outcome message; falls back to a generic monitoring
    note when no keyword matches. First matching keyword wins.
    """
    known_outcomes = (
        ("restart", "✅ Service restarted successfully."),
        ("reset", "✅ Connection reset resolved issue."),
        ("cache", "✅ Cache cleared; metrics normalizing."),
    )
    lowered = action_text.lower()
    for keyword, outcome in known_outcomes:
        if keyword in lowered:
            return outcome
    return "🕒 Monitoring post-action stabilization."
91
+
92
+
93
def process_event():
    """Simulate event -> detect -> diagnose -> heal -> log.

    Returns:
        A (status_message, dataframe) tuple where the dataframe contains
        the 20 most recently logged events for display in the UI.
    """
    global anomaly_counter

    # Force an anomaly every 4 events so the demo reliably shows detections.
    anomaly_counter += 1
    force_anomaly = anomaly_counter % 4 == 0

    event = simulate_event(force_anomaly=force_anomaly)
    is_anomaly = detect_anomaly(event)
    # NOTE: the previous version built an unused `result` dict here; it was
    # dead code (everything is recorded on `event` itself) and was removed.

    if is_anomaly:
        analysis = analyze_cause(event)
        event["analysis"] = analysis
        event["status"] = "Anomaly"

        # Attempt to extract the "Action:" line from the LLM output and
        # simulate executing that healing step.
        if "Action:" in analysis:
            action_line = analysis.split("Action:")[-1].strip()
            event["healing_action"] = simulate_healing(action_line)
        else:
            event["healing_action"] = "No actionable step detected."
    else:
        event["analysis"] = "-"
        event["status"] = "Normal"
        event["healing_action"] = "-"

    events_log.append(event)
    df = pd.DataFrame(events_log).tail(20)
    return f"✅ Event Processed ({event['status']})", df
126
 
127
+
128
# === Gradio UI ===
with gr.Blocks(title="🧠 Agentic Reliability Framework MVP") as demo:
    gr.Markdown("# 🧠 Agentic Reliability Framework MVP\n### Adaptive anomaly detection + AI-driven self-healing simulation")

    run_btn = gr.Button("🚀 Submit Telemetry Event")
    status = gr.Textbox(label="Detection Output")
    event_columns = ["timestamp", "component", "latency", "error_rate", "status", "analysis", "healing_action"]
    alerts = gr.Dataframe(
        headers=event_columns,
        label="Recent Events (Last 20)",
        wrap=True,
    )

    # Each button click generates one synthetic event and refreshes the table.
    run_btn.click(fn=process_event, inputs=None, outputs=[status, alerts])
138