petter2025 committed on
Commit
2d521fd
·
verified ·
1 Parent(s): bc93cf1

Add FastAPI app

Browse files
app/__init__.py ADDED
File without changes
app/api/__init__.py ADDED
File without changes
app/api/deps.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from app.database.session import SessionLocal
3
+ from slowapi import Limiter
4
+ from slowapi.util import get_remote_address
5
+ from app.core.config import settings
6
+
7
+ # ARF core engine imports
8
+ from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
9
+ from agentic_reliability_framework.core.decision.decision_engine import DecisionEngine
10
+ from agentic_reliability_framework.core.governance.stability_controller import LyapunovStabilityController
11
+ from agentic_reliability_framework.core.governance.causal_explainer import CausalExplainer
12
+ from agentic_reliability_framework.runtime.memory.rag_graph import RAGGraphMemory
13
+ from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction
14
+
15
+
16
def get_db():
    """FastAPI dependency that yields a database session.

    The session is closed after the request finishes, even when the
    route handler raised.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
23
+
24
+
25
+ # Rate limiter with default limit from settings
26
+ limiter = Limiter(key_func=get_remote_address, default_limits=[settings.RATE_LIMIT])
27
+
28
+
29
+ # ARF engine dependencies (singletons for simplicity)
30
+ _risk_engine = None
31
+ _decision_engine = None
32
+ _stability_controller = None
33
+ _causal_explainer = None
34
+ _rag_graph = None
35
+
36
+
37
def _seed_rag_graph(rag):
    """Populate a fresh RAG graph with synthetic historical outcomes.

    Gives the decision engine non-empty priors for every healing action
    before any real incidents have been recorded.
    """
    # (incident_id, component, action, success, resolution_minutes)
    seeds = [
        ("seed_restart_1", "test", HealingAction.RESTART_CONTAINER.value, True, 2),
        ("seed_restart_2", "test", HealingAction.RESTART_CONTAINER.value, True, 3),
        ("seed_restart_3", "test", HealingAction.RESTART_CONTAINER.value, False, 10),
        ("seed_rollback_1", "test", HealingAction.ROLLBACK.value, True, 1),
        ("seed_rollback_2", "test", HealingAction.ROLLBACK.value, True, 2),
        ("seed_rollback_3", "test", HealingAction.ROLLBACK.value, False, 5),
        ("seed_scale_1", "test", HealingAction.SCALE_OUT.value, True, 5),
        ("seed_scale_2", "test", HealingAction.SCALE_OUT.value, False, 15),
        ("seed_cb_1", "test", HealingAction.CIRCUIT_BREAKER.value, True, 1),
        ("seed_cb_2", "test", HealingAction.CIRCUIT_BREAKER.value, True, 2),
        ("seed_ts_1", "test", HealingAction.TRAFFIC_SHIFT.value, True, 4),
        ("seed_ts_2", "test", HealingAction.TRAFFIC_SHIFT.value, False, 8),
    ]
    for incident_id, component, action, succeeded, minutes in seeds:
        # Every seed shares the same synthetic degraded-metrics event.
        sample_event = ReliabilityEvent(
            component=component,
            latency_p99=500,
            error_rate=0.1,
            service_mesh="default",
        )
        rag.record_outcome(
            incident_id=incident_id,
            event=sample_event,
            action_taken=action,
            success=succeeded,
            resolution_time_minutes=minutes,
        )
    print("Seeded RAG graph with historical data", file=sys.stderr)
68
+
69
+
70
def get_rag_graph():
    """Return the process-wide RAGGraphMemory, creating and seeding it once."""
    global _rag_graph
    if _rag_graph is None:
        graph = RAGGraphMemory()
        _seed_rag_graph(graph)
        _rag_graph = graph
    return _rag_graph
76
+
77
+
78
def get_decision_engine():
    """Return the singleton DecisionEngine, backed by the shared RAG graph."""
    global _decision_engine
    if _decision_engine is None:
        _decision_engine = DecisionEngine(rag_graph=get_rag_graph())
    return _decision_engine
84
+
85
+
86
def get_risk_engine():
    """Return the lazily-created singleton RiskEngine."""
    global _risk_engine
    if _risk_engine is None:
        _risk_engine = RiskEngine()
    return _risk_engine
91
+
92
+
93
def get_stability_controller():
    """Return the lazily-created singleton LyapunovStabilityController."""
    global _stability_controller
    if _stability_controller is None:
        _stability_controller = LyapunovStabilityController()
    return _stability_controller
98
+
99
+
100
def get_causal_explainer():
    """Return the lazily-created singleton CausalExplainer."""
    global _causal_explainer
    if _causal_explainer is None:
        _causal_explainer = CausalExplainer()
    return _causal_explainer
app/api/routes_admin.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Admin API endpoints for API key management and audit logs.
3
+ These endpoints should be protected (e.g., by an admin API key) in production.
4
+ """
5
+ from fastapi import APIRouter, Depends, HTTPException, Query, Path, Body
6
+ from pydantic import BaseModel
7
+ from typing import Optional, List, Dict, Any
8
+ from datetime import datetime
9
+ import uuid
10
+
11
+ from app.core.usage_tracker import tracker, Tier
12
+
13
+ router = APIRouter(prefix="/admin", tags=["admin"])
14
+
15
# Development fallback only; override via the ARF_ADMIN_API_KEY environment
# variable so the secret never has to live in source control.
ADMIN_API_KEY = "admin_secret_change_me"


def verify_admin(admin_key: str = Query(..., alias="admin_key")):
    """FastAPI dependency guarding admin routes.

    Compares the ``admin_key`` query parameter against the configured admin
    secret (``ARF_ADMIN_API_KEY`` env var, falling back to the hard-coded
    development default) and raises 403 on mismatch.

    NOTE(review): secrets in the query string leak into access logs; moving
    this to a header is a recommended follow-up.
    """
    import os
    import secrets as _secrets
    expected = os.getenv("ARF_ADMIN_API_KEY", ADMIN_API_KEY)
    # compare_digest avoids leaking the key length/prefix via timing.
    if not _secrets.compare_digest(str(admin_key), expected):
        raise HTTPException(status_code=403, detail="Invalid admin key")
    return True
22
+
23
+ class CreateKeyRequest(BaseModel):
24
+ tier: str
25
+
26
+ class UpdateTierRequest(BaseModel):
27
+ tier: str
28
+
29
+
30
@router.post("/keys", dependencies=[Depends(verify_admin)])
async def create_api_key(req: CreateKeyRequest):
    """Mint a new live API key at the requested tier (admin only)."""
    valid_tiers = [t.value for t in Tier]
    if req.tier not in valid_tiers:
        raise HTTPException(status_code=400, detail=f"Invalid tier. Must be one of {valid_tiers}")
    new_key = f"sk_live_{uuid.uuid4().hex[:24]}"
    tracker.get_or_create_api_key(new_key, Tier(req.tier))
    return {"api_key": new_key, "tier": req.tier}
38
+
39
+
40
@router.get("/keys", dependencies=[Depends(verify_admin)])
async def list_api_keys(limit: int = 100, offset: int = 0):
    """List API keys (newest first) with their current-month usage.

    ``total`` is the full number of rows in ``api_keys`` (the original
    returned the page length, which made pagination impossible to drive).
    """
    # Loop-invariant: the month key does not change per row.
    month = tracker._get_month_key()
    with tracker._get_conn() as conn:
        total = conn.execute("SELECT COUNT(*) FROM api_keys").fetchone()[0]
        rows = conn.execute(
            "SELECT key, tier, created_at, last_used_at, is_active FROM api_keys ORDER BY created_at DESC LIMIT ? OFFSET ?",
            (limit, offset)
        ).fetchall()
        keys = []
        for row in rows:
            usage_row = conn.execute(
                "SELECT count FROM monthly_counts WHERE api_key = ? AND year_month = ?",
                (row["key"], month)
            ).fetchone()
            keys.append({
                "key": row["key"],
                "tier": row["tier"],
                "created_at": datetime.fromtimestamp(row["created_at"]).isoformat(),
                "last_used_at": datetime.fromtimestamp(row["last_used_at"]).isoformat() if row["last_used_at"] else None,
                "is_active": bool(row["is_active"]),
                "current_month_usage": usage_row["count"] if usage_row else 0,
            })
    return {"keys": keys, "total": total}
64
+
65
+
66
+ @router.patch("/keys/{api_key}/tier", dependencies=[Depends(verify_admin)])
67
+ async def update_key_tier(
68
+ api_key: str = Path(..., description="The API key to update"),
69
+ req: UpdateTierRequest = Body(...),
70
+ ):
71
+ if req.tier not in [t.value for t in Tier]:
72
+ raise HTTPException(status_code=400, detail=f"Invalid tier. Must be one of {[t.value for t in Tier]}")
73
+ with tracker._get_conn() as conn:
74
+ row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
75
+ if not row:
76
+ raise HTTPException(status_code=404, detail="API key not found")
77
+ conn.execute("UPDATE api_keys SET tier = ? WHERE key = ?", (req.tier, api_key))
78
+ conn.commit()
79
+ return {"message": f"Tier updated to {req.tier}"}
80
+
81
+
82
+ @router.delete("/keys/{api_key}", dependencies=[Depends(verify_admin)])
83
+ async def deactivate_api_key(api_key: str = Path(..., description="The API key to deactivate")):
84
+ with tracker._get_conn() as conn:
85
+ row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
86
+ if not row:
87
+ raise HTTPException(status_code=404, detail="API key not found")
88
+ conn.execute("UPDATE api_keys SET is_active = 0 WHERE key = ?", (api_key,))
89
+ conn.commit()
90
+ return {"message": "API key deactivated"}
91
+
92
+
93
@router.get("/audit/{api_key}", dependencies=[Depends(verify_admin)])
async def get_audit_logs(
    api_key: str = Path(..., description="The API key to audit"),
    start_date: Optional[str] = Query(None),
    end_date: Optional[str] = Query(None),
    limit: int = 100,
):
    """Return audit log entries for one API key, optionally date-bounded."""
    try:
        start = datetime.fromisoformat(start_date) if start_date else None
        end = datetime.fromisoformat(end_date) if end_date else None
    except ValueError:
        # Malformed client input is a 400, not an unhandled 500.
        raise HTTPException(status_code=400, detail="start_date/end_date must be ISO-8601 timestamps")
    logs = tracker.get_audit_logs(api_key, start, end, limit)
    return {"api_key": api_key, "logs": logs}
104
+
105
+
106
+ @router.get("/stats", dependencies=[Depends(verify_admin)])
107
+ async def get_global_stats():
108
+ with tracker._get_conn() as conn:
109
+ total_keys = conn.execute("SELECT COUNT(*) FROM api_keys WHERE is_active = 1").fetchone()[0]
110
+ total_requests = conn.execute("SELECT COUNT(*) FROM usage_log").fetchone()[0]
111
+ by_tier = conn.execute(
112
+ "SELECT tier, COUNT(*) as count FROM usage_log GROUP BY tier"
113
+ ).fetchall()
114
+ month = tracker._get_month_key()
115
+ current_month_requests = conn.execute(
116
+ "SELECT SUM(count) FROM monthly_counts WHERE year_month = ?", (month,)
117
+ ).fetchone()[0] or 0
118
+ return {
119
+ "active_api_keys": total_keys,
120
+ "total_evaluations": total_requests,
121
+ "current_month_evaluations": current_month_requests,
122
+ "by_tier": [{"tier": row[0], "count": row[1]} for row in by_tier],
123
+ }
app/api/routes_governance.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends, HTTPException, Request, BackgroundTasks
2
+ from fastapi.encoders import jsonable_encoder
3
+ from sqlalchemy.orm import Session
4
+ from app.models.infrastructure_intents import InfrastructureIntentRequest
5
+ from app.services.intent_adapter import to_oss_intent
6
+ from app.services.risk_service import evaluate_intent, evaluate_healing_decision
7
+ from app.services.intent_store import save_evaluated_intent
8
+ from app.services.outcome_service import record_outcome
9
+ from app.api.deps import get_db
10
+ from pydantic import BaseModel
11
+ import uuid
12
+ import logging
13
+ import time
14
+
15
+ from agentic_reliability_framework.core.models.event import ReliabilityEvent
16
+
17
+ # ===== USAGE TRACKER IMPORTS =====
18
+ from app.core.usage_tracker import enforce_quota, UsageRecord, tracker
19
+
20
+ logger = logging.getLogger(__name__)
21
+ router = APIRouter()
22
+
23
+
24
+ class OutcomeRequest(BaseModel):
25
+ deterministic_id: str
26
+ success: bool
27
+ recorded_by: str
28
+ notes: str = ""
29
+
30
+
31
+ class HealingDecisionRequest(BaseModel):
32
+ event: ReliabilityEvent
33
+
34
+
35
+ @router.post("/intents/evaluate")
36
+ async def evaluate_intent_endpoint(
37
+ request: Request,
38
+ intent_req: InfrastructureIntentRequest,
39
+ background_tasks: BackgroundTasks,
40
+ db: Session = Depends(get_db),
41
+ quota: dict = Depends(enforce_quota)
42
+ ):
43
+ start_time = time.time()
44
+ api_key = quota["api_key"]
45
+ tier = quota["tier"]
46
+ response_data = None
47
+ error_msg = None
48
+
49
+ try:
50
+ oss_intent = to_oss_intent(intent_req)
51
+ risk_engine = request.app.state.risk_engine
52
+ result = evaluate_intent(
53
+ engine=risk_engine,
54
+ intent=oss_intent,
55
+ cost_estimate=intent_req.estimated_cost,
56
+ policy_violations=intent_req.policy_violations
57
+ )
58
+
59
+ deterministic_id = str(uuid.uuid4())
60
+ api_payload = jsonable_encoder(intent_req.model_dump())
61
+ oss_payload = jsonable_encoder(oss_intent.model_dump())
62
+
63
+ save_evaluated_intent(
64
+ db=db,
65
+ deterministic_id=deterministic_id,
66
+ intent_type=intent_req.intent_type,
67
+ api_payload=api_payload,
68
+ oss_payload=oss_payload,
69
+ environment=str(intent_req.environment),
70
+ risk_score=result["risk_score"]
71
+ )
72
+
73
+ result["intent_id"] = deterministic_id
74
+ response_data = result
75
+
76
+ if tracker:
77
+ record = UsageRecord(
78
+ api_key=api_key,
79
+ tier=tier,
80
+ timestamp=time.time(),
81
+ endpoint="/api/v1/intents/evaluate",
82
+ request_body=intent_req.model_dump(),
83
+ response=response_data,
84
+ processing_ms=(time.time() - start_time) * 1000,
85
+ )
86
+ await tracker.increment_usage_async(record, background_tasks)
87
+
88
+ return response_data
89
+
90
+ except HTTPException:
91
+ raise
92
+ except Exception as e:
93
+ error_msg = str(e)
94
+ logger.exception("Error in evaluate_intent_endpoint")
95
+ if tracker:
96
+ record = UsageRecord(
97
+ api_key=api_key,
98
+ tier=tier,
99
+ timestamp=time.time(),
100
+ endpoint="/api/v1/intents/evaluate",
101
+ request_body=intent_req.model_dump(),
102
+ error=error_msg,
103
+ processing_ms=(time.time() - start_time) * 1000,
104
+ )
105
+ await tracker.increment_usage_async(record, background_tasks)
106
+ raise HTTPException(status_code=500, detail=error_msg)
107
+
108
+
109
+ @router.post("/intents/outcome")
110
+ async def record_outcome_endpoint(
111
+ request: Request,
112
+ outcome: OutcomeRequest,
113
+ db: Session = Depends(get_db)
114
+ ):
115
+ # No usage tracking for outcomes (doesn't count against quota)
116
+ try:
117
+ risk_engine = request.app.state.risk_engine
118
+ outcome_record = record_outcome(
119
+ db=db,
120
+ deterministic_id=outcome.deterministic_id,
121
+ success=outcome.success,
122
+ recorded_by=outcome.recorded_by,
123
+ notes=outcome.notes,
124
+ risk_engine=risk_engine
125
+ )
126
+ return {"message": "Outcome recorded", "outcome_id": outcome_record.id}
127
+ except Exception as e:
128
+ raise HTTPException(status_code=500, detail=str(e))
129
+
130
+
131
+ @router.post("/healing/evaluate")
132
+ async def evaluate_healing_decision_endpoint(
133
+ request: Request,
134
+ decision_req: HealingDecisionRequest,
135
+ background_tasks: BackgroundTasks,
136
+ quota: dict = Depends(enforce_quota)
137
+ ):
138
+ start_time = time.time()
139
+ api_key = quota["api_key"]
140
+ tier = quota["tier"]
141
+ response_data = None
142
+ error_msg = None
143
+
144
+ try:
145
+ policy_engine = request.app.state.policy_engine
146
+ rag_graph = getattr(request.app.state, "rag_graph", None)
147
+ model = getattr(request.app.state, "epistemic_model", None)
148
+ tokenizer = getattr(request.app.state, "epistemic_tokenizer", None)
149
+
150
+ response_data = evaluate_healing_decision(
151
+ event=decision_req.event,
152
+ policy_engine=policy_engine,
153
+ decision_engine=None,
154
+ rag_graph=rag_graph,
155
+ model=model,
156
+ tokenizer=tokenizer,
157
+ )
158
+
159
+ if tracker:
160
+ record = UsageRecord(
161
+ api_key=api_key,
162
+ tier=tier,
163
+ timestamp=time.time(),
164
+ endpoint="/api/v1/healing/evaluate",
165
+ request_body=decision_req.model_dump(),
166
+ response=response_data,
167
+ processing_ms=(time.time() - start_time) * 1000,
168
+ )
169
+ await tracker.increment_usage_async(record, background_tasks)
170
+
171
+ return response_data
172
+
173
+ except HTTPException:
174
+ raise
175
+ except Exception as e:
176
+ error_msg = str(e)
177
+ logger.exception("Error in evaluate_healing_decision_endpoint")
178
+ if tracker:
179
+ record = UsageRecord(
180
+ api_key=api_key,
181
+ tier=tier,
182
+ timestamp=time.time(),
183
+ endpoint="/api/v1/healing/evaluate",
184
+ request_body=decision_req.model_dump(),
185
+ error=error_msg,
186
+ processing_ms=(time.time() - start_time) * 1000,
187
+ )
188
+ await tracker.increment_usage_async(record, background_tasks)
189
+ raise HTTPException(status_code=500, detail=error_msg)
app/api/routes_history.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+ from app.core.storage import incident_history
3
+
4
+ router = APIRouter()
5
+
6
+
7
+ @router.get("/history")
8
+ async def get_history():
9
+ return {"incidents": incident_history}
app/api/routes_incidents.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.causal_explainer import CausalExplainer
2
+ from fastapi import APIRouter, Depends, Request, BackgroundTasks, HTTPException
3
+ from pydantic import BaseModel
4
+ from typing import Optional
5
+ from enum import Enum
6
+ import time
7
+ import json
8
+
9
+ # ===== USAGE TRACKER IMPORTS =====
10
+ from app.core.usage_tracker import enforce_quota, UsageRecord, tracker
11
+
12
+
13
class HealingAction(str, Enum):
    """Closed set of remediation actions the advisory engine can propose."""

    NO_ACTION = "no_action"
    RESTART_CONTAINER = "restart_container"
    SCALE_OUT = "scale_out"
    ROLLBACK = "rollback"
    CIRCUIT_BREAKER = "circuit_breaker"
    TRAFFIC_SHIFT = "traffic_shift"
    ALERT_TEAM = "alert_team"
21
+
22
+
23
+ class ReliabilityEvent(BaseModel):
24
+ component: str
25
+ latency_p99: float
26
+ error_rate: float
27
+ service_mesh: str = "default"
28
+ cpu_util: Optional[float] = None
29
+ memory_util: Optional[float] = None
30
+
31
+
32
+ router = APIRouter()
33
+ incident_history = []
34
+
35
+
36
@router.post("/report_incident")
async def report_incident(event: ReliabilityEvent):
    """Append the reported incident to the in-memory history buffer."""
    # model_dump() is the pydantic-v2 spelling; .dict() is deprecated and
    # this keeps the module consistent with routes_governance.py.
    incident_history.append(event.model_dump())
    return {"status": "recorded"}
40
+
41
+
42
+ @router.post("/v1/incidents/evaluate")
43
+ async def evaluate_incident(
44
+ request: Request,
45
+ event: ReliabilityEvent,
46
+ background_tasks: BackgroundTasks,
47
+ quota: dict = Depends(enforce_quota)
48
+ ):
49
+ start_time = time.time()
50
+ api_key = quota["api_key"]
51
+ tier = quota["tier"]
52
+ response_data = None
53
+ error_msg = None
54
+
55
+ try:
56
+ # Simple risk score (heuristic)
57
+ risk_score = min(1.0, (event.latency_p99 / 1000.0) * 0.7 + event.error_rate * 0.3)
58
+
59
+ if event.latency_p99 > 500 or event.error_rate > 0.15:
60
+ optimal_action = HealingAction.RESTART_CONTAINER
61
+ else:
62
+ optimal_action = HealingAction.NO_ACTION
63
+
64
+ current_state = {
65
+ "latency": event.latency_p99,
66
+ "error_rate": event.error_rate,
67
+ "last_action": {"action_type": "no_action"}
68
+ }
69
+ proposed_action = {"action_type": optimal_action.value, "params": {}}
70
+ ce = CausalExplainer()
71
+ causal_exp = ce.explain_healing_intent(proposed_action, current_state, "latency")
72
+
73
+ healing_intent = {
74
+ "action": optimal_action.value,
75
+ "component": event.component,
76
+ "parameters": proposed_action["params"],
77
+ "justification": f"Causal: {causal_exp.explanation_text}",
78
+ "confidence": 0.85,
79
+ "risk_score": risk_score,
80
+ "status": "oss_advisory_only"
81
+ }
82
+
83
+ response_data = {
84
+ "healing_intent": healing_intent,
85
+ "causal_explanation": {
86
+ "factual_outcome": causal_exp.factual_outcome,
87
+ "counterfactual_outcome": causal_exp.counterfactual_outcome,
88
+ "effect": causal_exp.effect,
89
+ "explanation_text": causal_exp.explanation_text,
90
+ "is_model_based": causal_exp.is_model_based,
91
+ "warnings": causal_exp.warnings
92
+ },
93
+ "utility_decision": {
94
+ "best_action": optimal_action.value,
95
+ "expected_utility": 0.5,
96
+ "explanation": "Heuristic decision based on latency/error thresholds"
97
+ }
98
+ }
99
+
100
+ # Asynchronous usage logging
101
+ if tracker:
102
+ record = UsageRecord(
103
+ api_key=api_key,
104
+ tier=tier,
105
+ timestamp=time.time(),
106
+ endpoint="/v1/incidents/evaluate",
107
+ request_body=event.dict(),
108
+ response=response_data,
109
+ processing_ms=(time.time() - start_time) * 1000,
110
+ )
111
+ await tracker.increment_usage_async(record, background_tasks)
112
+
113
+ return response_data
114
+
115
+ except HTTPException:
116
+ raise
117
+ except Exception as e:
118
+ error_msg = str(e)
119
+ # Log failure in background
120
+ if tracker:
121
+ record = UsageRecord(
122
+ api_key=api_key,
123
+ tier=tier,
124
+ timestamp=time.time(),
125
+ endpoint="/v1/incidents/evaluate",
126
+ request_body=event.dict(),
127
+ error=error_msg,
128
+ processing_ms=(time.time() - start_time) * 1000,
129
+ )
130
+ await tracker.increment_usage_async(record, background_tasks)
131
+ raise HTTPException(status_code=500, detail=error_msg)
app/api/routes_intents.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException
2
+ from app.models.intent_models import IntentSimulation, IntentSimulationResponse
3
+ from app.services.intent_service import simulate_intent
4
+
5
+ router = APIRouter()
6
+
7
+
8
+ @router.post("/simulate_intent", response_model=IntentSimulationResponse)
9
+ async def simulate_intent_endpoint(intent: IntentSimulation):
10
+ try:
11
+ result = simulate_intent(intent)
12
+ return IntentSimulationResponse(**result)
13
+ except Exception as e:
14
+ raise HTTPException(status_code=500, detail=str(e))
app/api/routes_memory.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Request
2
+
3
+ router = APIRouter()
4
+
5
+
6
+ @router.get("/stats")
7
+ async def memory_stats(request: Request):
8
+ """
9
+ Return current memory graph statistics using the risk engine's memory instance.
10
+ """
11
+ risk_engine = request.app.state.risk_engine
12
+
13
+ # Check if memory exists and has the required method
14
+ if hasattr(risk_engine, 'memory') and hasattr(risk_engine.memory, 'get_graph_stats'):
15
+ stats = risk_engine.memory.get_graph_stats()
16
+ return stats
17
+ else:
18
+ # Graceful fallback (e.g., during testing or if memory not initialized)
19
+ return {
20
+ "incident_nodes": 0,
21
+ "outcome_nodes": 0,
22
+ "edges": 0,
23
+ "message": "Memory not fully initialized"
24
+ }
app/api/routes_payments.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Payment endpoints – Stripe Checkout integration.
3
+ """
4
+
5
+ import os
6
+ import stripe
7
+ from fastapi import APIRouter, HTTPException, Request
8
+ from pydantic import BaseModel
9
+ from typing import Optional
10
+
11
+ from app.core.config import settings
12
+ from app.core.usage_tracker import tracker, Tier
13
+
14
+ router = APIRouter(prefix="/payments", tags=["payments"])
15
+
16
+ # Set Stripe API key (from environment)
17
+ stripe.api_key = os.getenv("STRIPE_SECRET_KEY")
18
+ STRIPE_WEBHOOK_SECRET = os.getenv("STRIPE_WEBHOOK_SECRET")
19
+
20
+ class CheckoutRequest(BaseModel):
21
+ api_key: str
22
+ success_url: str
23
+ cancel_url: str
24
+
25
+
26
@router.post("/create-checkout-session")
async def create_checkout_session(req: CheckoutRequest):
    """Create a Stripe Checkout session upgrading a FREE key to Pro.

    Raises 500 if Stripe is unconfigured, 503 if usage tracking is
    disabled, and 400 if the key is not currently on the free tier.
    """
    if not stripe.api_key:
        raise HTTPException(status_code=500, detail="Stripe not configured")

    if tracker is None:
        # Previously tier stayed None here and the caller got the misleading
        # "Only free tier keys can be upgraded" error instead of a 503.
        raise HTTPException(status_code=503, detail="Usage tracking not available")

    # Verify the API key exists and is free tier
    tier = tracker.get_tier(req.api_key)
    if tier != Tier.FREE:
        raise HTTPException(status_code=400, detail="Only free tier keys can be upgraded")

    try:
        checkout_session = stripe.checkout.Session.create(
            payment_method_types=["card"],
            line_items=[
                {
                    "price": os.getenv("STRIPE_PRO_PRICE_ID"),  # e.g., "price_123"
                    "quantity": 1,
                }
            ],
            mode="subscription",
            success_url=req.success_url,
            cancel_url=req.cancel_url,
            metadata={"api_key": req.api_key},
            client_reference_id=req.api_key,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return {"sessionId": checkout_session.id, "url": checkout_session.url}
app/api/routes_risk.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException
2
+ from app.models.risk_models import RiskResponse
3
+ from app.services.risk_service import get_system_risk
4
+
5
+ router = APIRouter()
6
+
7
+
8
+ @router.get("/get_risk", response_model=RiskResponse)
9
+ async def get_risk():
10
+ try:
11
+ risk = get_system_risk()
12
+ if risk < 0.3:
13
+ status = "low"
14
+ elif risk < 0.6:
15
+ status = "moderate"
16
+ elif risk < 0.8:
17
+ status = "high"
18
+ else:
19
+ status = "critical"
20
+ return RiskResponse(system_risk=risk, status=status)
21
+ except Exception as e:
22
+ raise HTTPException(status_code=500, detail=str(e))
23
+
24
+
25
+ @router.get("/history")
26
+ async def get_risk_history():
27
+ """
28
+ Return dummy historical risk data for the last 24 hours.
29
+ Replace with real database query later.
30
+ """
31
+ import random
32
+ import datetime
33
+ now = datetime.datetime.now()
34
+ data = []
35
+ for i in range(24, 0, -1):
36
+ data.append({
37
+ "time": (now - datetime.timedelta(hours=i)).isoformat(),
38
+ "risk": round(random.uniform(0.2, 0.8), 2)
39
+ })
40
+ return data
app/api/routes_users.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ User endpoints – registration and quota information.
3
+ """
4
+
5
+ import uuid
6
+ from fastapi import APIRouter, Depends, HTTPException, Request
7
+ from slowapi import Limiter
8
+ from slowapi.util import get_remote_address
9
+ from app.core.usage_tracker import tracker, enforce_quota, Tier
10
+
11
+ router = APIRouter(prefix="/users", tags=["users"])
12
+
13
+ # Rate limiter for registration (5 per hour per IP)
14
+ limiter = Limiter(key_func=get_remote_address, default_limits=["5/hour"])
15
+
16
+
17
@router.post("/register")
@limiter.limit("5/hour")
async def register_user(request: Request):
    """Create and return a brand-new free-tier API key.

    Public endpoint, rate-limited to 5 requests per hour per client IP.
    """
    if tracker is None:
        raise HTTPException(status_code=503, detail="Usage tracking not available")

    # Mint a fresh key and persist it at the FREE tier.
    fresh_key = f"sk_free_{uuid.uuid4().hex[:24]}"
    if not tracker.get_or_create_api_key(fresh_key, Tier.FREE):
        raise HTTPException(status_code=500, detail="Failed to create API key")

    return {
        "api_key": fresh_key,
        "tier": "free",
        "message": "API key created. Store it securely – you won't see it again."
    }
40
+
41
+
42
@router.get("/quota")
async def get_user_quota(request: Request, quota: dict = Depends(enforce_quota)):
    """Return the caller's tier and remaining monthly evaluation quota.

    Requires the API key in the Authorization header (enforced by
    ``enforce_quota``).
    """
    tier = quota["tier"]
    # The original guarded ``limit`` against tier being None but then
    # dereferenced ``tier.value`` unconditionally; guard both consistently.
    return {
        "tier": tier.value if tier else None,
        "remaining": quota["remaining"],
        "limit": tier.monthly_evaluation_limit if tier else None,
    }
app/api/webhooks.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stripe webhook handler – updates API key tier on subscription events.
3
+ """
4
+
5
+ import os
6
+ import stripe
7
+ from fastapi import APIRouter, Request, HTTPException
8
+ from app.core.usage_tracker import update_key_tier, Tier
9
+
10
+ router = APIRouter(prefix="/webhooks", tags=["webhooks"])
11
+
12
+ STRIPE_WEBHOOK_SECRET = os.getenv("STRIPE_WEBHOOK_SECRET")
13
+ stripe.api_key = os.getenv("STRIPE_SECRET_KEY")
14
+
15
+
16
+ @router.post("/stripe")
17
+ async def stripe_webhook(request: Request):
18
+ payload = await request.body()
19
+ sig_header = request.headers.get("stripe-signature")
20
+
21
+ if not STRIPE_WEBHOOK_SECRET or not stripe.api_key:
22
+ raise HTTPException(status_code=500, detail="Stripe not configured")
23
+
24
+ try:
25
+ event = stripe.Webhook.construct_event(
26
+ payload, sig_header, STRIPE_WEBHOOK_SECRET
27
+ )
28
+ except ValueError:
29
+ raise HTTPException(status_code=400, detail="Invalid payload")
30
+ except stripe.error.SignatureVerificationError:
31
+ raise HTTPException(status_code=400, detail="Invalid signature")
32
+
33
+ # Handle subscription events
34
+ if event["type"] == "checkout.session.completed":
35
+ session = event["data"]["object"]
36
+ api_key = session.get("client_reference_id") or session.get("metadata", {}).get("api_key")
37
+ if api_key:
38
+ update_key_tier(api_key, Tier.PRO)
39
+ elif event["type"] == "customer.subscription.deleted":
40
+ subscription = event["data"]["object"]
41
+ # You need to store a mapping from subscription ID to API key.
42
+ # For simplicity, we assume you stored it in metadata during checkout.
43
+ # Alternatively, look up by customer ID.
44
+ api_key = subscription.get("metadata", {}).get("api_key")
45
+ if api_key:
46
+ update_key_tier(api_key, Tier.FREE)
47
+
48
+ return {"status": "ok"}
app/causal_explainer.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Causal reasoning module for ARF – OSS Edition.
3
+ Provides counterfactual explanations using deterministic heuristics.
4
+ No external causal libraries required.
5
+ """
6
+ from typing import Dict, Any, Optional, List, Tuple
7
+ from dataclasses import dataclass, field
8
+ import pandas as pd
9
+
10
+
11
@dataclass
class CausalExplanation:
    """Result of one counterfactual query.

    Holds the observed (factual) metric value, the estimated value under
    the alternative action, and their difference, plus an optional
    confidence interval, human-readable text, and caveats.
    """

    factual_outcome: float
    counterfactual_outcome: float
    effect: float
    is_model_based: bool
    confidence_interval: Optional[Tuple[float, float]] = None
    explanation_text: str = ""
    warnings: List[str] = field(default_factory=list)
20
+
21
+
22
class CausalExplainer:
    """
    Heuristic causal explainer for healing actions.

    Uses fixed domain rules (a fractional impact per action type) plus a
    constant uncertainty band to produce counterfactual estimates; no
    fitted structural causal model and no external causal libraries.
    """

    def __init__(self, memory_store=None):
        self.memory = memory_store
        self.treatment = "healing_action"  # symbolic name
        self.outcome = "latency"
        # Fractional effect of each action on the tracked metrics
        # (negative = improvement).
        self._action_impact = {
            "restart_container": {"latency_effect": -0.15, "error_rate_effect": -0.10},
            "scale_out": {"latency_effect": -0.20, "error_rate_effect": -0.05},
            "rollback": {"latency_effect": -0.25, "error_rate_effect": -0.20},
            "circuit_breaker": {"latency_effect": -0.05, "error_rate_effect": -0.30},
            "traffic_shift": {"latency_effect": -0.10, "error_rate_effect": -0.10},
            "alert_team": {"latency_effect": 0.0, "error_rate_effect": 0.0},
            "no_action": {"latency_effect": 0.0, "error_rate_effect": 0.0},
        }
        self._uncertainty = 0.1  # ±10% confidence interval

    def _extract_action_intensity(self, action_dict: Dict[str, Any]) -> float:
        """Map an action dict to a scalar "how invasive is this" score in [0, 1]."""
        action_type = action_dict.get("action_type", "no_action")
        if action_type == "no_action":
            return 0.0
        intensity_map = {
            "restart_container": 0.4,
            "scale_out": 0.6,
            "rollback": 0.8,
            "circuit_breaker": 0.7,
            "traffic_shift": 0.5,
            "alert_team": 0.1,
        }
        return intensity_map.get(action_type, 0.0)

    def _get_effect_for_action(self, action_dict: Dict[str, Any], metric: str) -> float:
        """Fractional effect of the action on ``metric`` ("latency" or "error_rate")."""
        action_type = action_dict.get("action_type", "no_action")
        impacts = self._action_impact.get(action_type, self._action_impact["no_action"])
        if metric == "latency":
            return impacts["latency_effect"]
        elif metric == "error_rate":
            return impacts["error_rate_effect"]
        return 0.0

    def counterfactual_explanation(
        self,
        observed_context: Dict[str, Any],
        alternative_action: Dict[str, Any],
        outcome_name: str = "latency",
        confidence_level: float = 0.95
    ) -> CausalExplanation:
        """Estimate ``outcome_name`` under ``alternative_action`` vs. what was observed.

        ``confidence_level`` is accepted for interface compatibility, but the
        interval width is driven solely by the fixed ``_uncertainty`` fraction.
        """
        factual_outcome = observed_context.get(outcome_name, 0.0)
        factual_action = observed_context.get("action_taken", {})
        factual_intensity = self._extract_action_intensity(factual_action)
        alt_intensity = self._extract_action_intensity(alternative_action)

        effect_frac = self._get_effect_for_action(alternative_action, outcome_name)
        # "Undo" case: comparing an active factual action against no_action
        # means the counterfactual reverses the factual action's effect.
        if alt_intensity == 0.0 and factual_intensity > 0.0:
            effect_frac = -self._get_effect_for_action(factual_action, outcome_name)

        counterfactual = max(0.0, factual_outcome * (1.0 + effect_frac))
        effect = counterfactual - factual_outcome
        ci_half = abs(effect) * self._uncertainty
        confidence_interval = (counterfactual - ci_half, counterfactual + ci_half)

        # The original auto-formatted f-string split replacement fields across
        # lines inside single-quoted literals (a syntax error before Python
        # 3.12); rebuilt here with the same rendered wording.
        alt_name = alternative_action.get("action_type", "unknown")
        factual_name = factual_action.get("action_type", "no action")
        explanation_text = (
            f"If we apply {alt_name} instead of {factual_name}, "
            f"{outcome_name} would change from {factual_outcome:.2f} to "
            f"{counterfactual:.2f} (Δ = {effect:.2f}). "
            f"Based on heuristic causal model."
        )

        return CausalExplanation(
            factual_outcome=factual_outcome,
            counterfactual_outcome=counterfactual,
            effect=effect,
            is_model_based=False,
            confidence_interval=confidence_interval,
            explanation_text=explanation_text,
            warnings=["Using heuristic causal model (no fitted SCM)."]
        )

    def explain_healing_intent(
        self,
        proposed_action: Dict[str, Any],
        current_state: Dict[str, Any],
        outcome_metric: str = "latency"
    ) -> CausalExplanation:
        """Explain ``proposed_action`` against ``current_state``.

        Builds an observed-context dict (``**current_state`` last, so its own
        keys win, matching the original merge order) and delegates to
        ``counterfactual_explanation``.
        """
        observed = {
            outcome_metric: current_state.get(outcome_metric, 0.0),
            "action_taken": current_state.get("last_action", {"action_type": "no_action"}),
            **current_state,
        }
        return self.counterfactual_explanation(
            observed_context=observed,
            alternative_action=proposed_action,
            outcome_name=outcome_metric
        )

    def discover_graph_from_memory(self, data: pd.DataFrame, method: str = "pc") -> Dict[str, Any]:
        """Stub graph discovery: returns the column set with no edges."""
        return {"nodes": list(data.columns), "edges": []}

    def fit_scm(
        self,
        data: pd.DataFrame,
        treatment: str,
        outcome: str,
        graph: Optional[Dict] = None
    ):
        """Stub SCM fit: records the treatment/outcome names only."""
        self.treatment = treatment
        self.outcome = outcome

    def estimate_effect(self, method_name: str = "backdoor.linear_regression") -> Optional[float]:
        """Stub effect estimate: no fitted model, so always None."""
        return None
app/core/__init__.py ADDED
File without changes
app/core/config.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pydantic_settings import BaseSettings
from typing import Optional


class Settings(BaseSettings):
    """Central application configuration, sourced from the environment / .env.

    pydantic-settings matches field names against environment variables
    (case-insensitively by default); variables it does not know about are
    ignored (extra = "ignore").
    """

    app_name: str = "ARF API Control Plane"
    environment: str = "development"  # deployment environment label
    database_url: Optional[str] = None  # SQLAlchemy URL consumed by app.database.session
    api_key: Optional[str] = None  # optional static API key
    RATE_LIMIT: str = "100/minute"  # default limit (slowapi rate string)

    # Usage tracker settings
    ARF_USAGE_TRACKING: bool = False  # master switch for quota/audit tracking
    ARF_USAGE_DB_PATH: str = "arf_usage.db"  # SQLite file used by UsageTracker
    ARF_REDIS_URL: Optional[str] = None  # optional Redis backend for counters
    ARF_API_KEYS: str = "{}"  # JSON string of {key: tier}

    class Config:
        env_file = ".env"
        extra = "ignore"


# Singleton settings instance, imported throughout the app.
settings = Settings()
app/core/storage.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# Simple in-memory list for incident history
# NOTE(review): module-level mutable state — per-process only and lost on
# restart; fine for demos, should move to persistent storage for production.
incident_history: list = []
app/core/usage_tracker.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage Tracker for ARF API – quotas, tiers, and audit logging.
3
+ Non‑invasive, configurable, thread‑safe, and background‑task ready.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import sqlite3
9
+ import threading
10
+ import time
11
+ from contextlib import contextmanager
12
+ from datetime import datetime, timedelta
13
+ from typing import Dict, Any, Optional, List
14
+ from enum import Enum
15
+ from dataclasses import dataclass
16
+ from fastapi import BackgroundTasks
17
+
18
+ # Optional Redis support
19
+ try:
20
+ import redis
21
+ REDIS_AVAILABLE = True
22
+ except ImportError:
23
+ REDIS_AVAILABLE = False
24
+ redis = None
25
+
26
+
27
class Tier(str, Enum):
    """Subscription tiers and their quota / retention policies."""

    FREE = "free"
    PRO = "pro"
    PREMIUM = "premium"
    ENTERPRISE = "enterprise"

    @property
    def monthly_evaluation_limit(self) -> Optional[int]:
        """Maximum evaluations per calendar month; None means unlimited."""
        if self is Tier.ENTERPRISE:
            return None  # unlimited
        per_tier = {
            Tier.FREE: 1000,
            Tier.PRO: 10_000,
            Tier.PREMIUM: 50_000,
        }
        return per_tier[self]

    @property
    def audit_log_retention_days(self) -> int:
        """Number of days audit-log rows for this tier are retained."""
        return {
            Tier.FREE: 7,
            Tier.PRO: 30,
            Tier.PREMIUM: 90,
            Tier.ENTERPRISE: 365,
        }[self]
52
+
53
+
54
@dataclass
class UsageRecord:
    """Single evaluation usage record (one row of the usage_log table)."""
    # Caller identity and its billing tier at request time.
    api_key: str
    tier: Tier
    # Unix epoch seconds when the request was handled.
    timestamp: float
    endpoint: str
    # Optional audit payloads; JSON-serialized on persistence.
    request_body: Optional[Dict[str, Any]] = None
    response: Optional[Dict[str, Any]] = None
    # Error string when the request failed, plus handler latency in ms.
    error: Optional[str] = None
    processing_ms: Optional[float] = None
65
+
66
+
67
class UsageTracker:
    """
    Thread‑safe usage tracker with SQLite storage and optional Redis for counters.

    SQLite holds API keys, the audit log, and (when Redis is not configured)
    the monthly quota counters. Redis, when available, only replaces the
    monthly counters.
    """

    def __init__(self, db_path: str = "arf_usage.db", redis_url: Optional[str] = None):
        """Open/create the SQLite store; connect to Redis when a URL is given."""
        self.db_path = db_path
        # One SQLite connection per thread, created lazily in _get_conn().
        self._local = threading.local()
        self._init_db()

        self._redis_client = None
        if redis_url and REDIS_AVAILABLE:
            self._redis_client = redis.from_url(redis_url)
        elif redis_url:
            # A Redis URL was configured but the client library is missing:
            # fail loudly instead of silently degrading to SQLite counters.
            raise ImportError("Redis client not installed. Run: pip install redis")

    @contextmanager
    def _get_conn(self):
        """Get a thread‑local SQLite connection."""
        # Connections are cached per thread and intentionally never closed
        # here; they live for the lifetime of the thread.
        if not hasattr(self._local, "conn"):
            self._local.conn = sqlite3.connect(self.db_path, check_same_thread=False)
            self._local.conn.row_factory = sqlite3.Row
        yield self._local.conn

    def _init_db(self) -> None:
        """Create tables and indexes if they do not already exist (idempotent)."""
        with self._get_conn() as conn:
            # Registered API keys and their current tier.
            conn.execute("""
                CREATE TABLE IF NOT EXISTS api_keys (
                    key TEXT PRIMARY KEY,
                    tier TEXT NOT NULL,
                    created_at REAL NOT NULL,
                    last_used_at REAL,
                    is_active INTEGER DEFAULT 1
                )
            """)
            # Per-request audit log; request/response stored as JSON text.
            conn.execute("""
                CREATE TABLE IF NOT EXISTS usage_log (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    api_key TEXT NOT NULL,
                    tier TEXT NOT NULL,
                    timestamp REAL NOT NULL,
                    endpoint TEXT NOT NULL,
                    request_body TEXT,
                    response TEXT,
                    error TEXT,
                    processing_ms REAL
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_api_key_timestamp
                ON usage_log (api_key, timestamp)
            """)
            # Monthly quota counters used when Redis is not configured.
            conn.execute("""
                CREATE TABLE IF NOT EXISTS monthly_counts (
                    api_key TEXT NOT NULL,
                    year_month TEXT NOT NULL,
                    count INTEGER DEFAULT 0,
                    PRIMARY KEY (api_key, year_month)
                )
            """)
            conn.commit()

    def _get_month_key(self) -> str:
        """Return the current month bucket, e.g. '2025-06' (local time)."""
        return datetime.now().strftime("%Y-%m")

    def get_or_create_api_key(self, key: str, tier: Tier = Tier.FREE) -> bool:
        """Register a new API key. Returns True if key exists or was created."""
        with self._get_conn() as conn:
            row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (key,)).fetchone()
            if row:
                # Existing key keeps its current tier; no update performed here.
                return True
            conn.execute(
                "INSERT INTO api_keys (key, tier, created_at, is_active) VALUES (?, ?, ?, ?)",
                (key, tier.value, time.time(), 1)
            )
            conn.commit()
            return True

    def get_tier(self, api_key: str) -> Optional[Tier]:
        """Return the tier for a given API key, or None if key invalid/inactive."""
        with self._get_conn() as conn:
            row = conn.execute(
                "SELECT tier FROM api_keys WHERE key = ? AND is_active = 1",
                (api_key,)
            ).fetchone()
            if not row:
                return None
            return Tier(row["tier"])

    def update_api_key_tier(self, api_key: str, new_tier: Tier) -> bool:
        """Update the tier of an existing API key. Returns True if successful."""
        with self._get_conn() as conn:
            row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
            if not row:
                return False
            conn.execute("UPDATE api_keys SET tier = ? WHERE key = ?", (new_tier.value, api_key))
            conn.commit()
            return True

    def get_remaining_quota(self, api_key: str, tier: Tier) -> Optional[int]:
        """Return remaining evaluations for the month, or None if unlimited."""
        limit = tier.monthly_evaluation_limit
        if limit is None:
            return None

        month = self._get_month_key()
        # Prefer the Redis counter when configured (shared across processes).
        if self._redis_client:
            redis_key = f"arf:quota:{api_key}:{month}"
            count = int(self._redis_client.get(redis_key) or 0)
            return max(0, limit - count)

        # SQLite fallback: counter row may not exist yet for this month.
        with self._get_conn() as conn:
            row = conn.execute(
                "SELECT count FROM monthly_counts WHERE api_key = ? AND year_month = ?",
                (api_key, month)
            ).fetchone()
            count = row["count"] if row else 0
            return max(0, limit - count)

    def _increment_quota(self, api_key: str, tier: Tier) -> None:
        """Increment the monthly counter (internal, synchronous)."""
        limit = tier.monthly_evaluation_limit
        if limit is None:
            # Unlimited tiers are not counted at all.
            return
        month = self._get_month_key()
        if self._redis_client:
            redis_key = f"arf:quota:{api_key}:{month}"
            self._redis_client.incr(redis_key)
            # Counter expires shortly after the month rolls over.
            self._redis_client.expire(redis_key, timedelta(days=31))
        else:
            with self._get_conn() as conn:
                # Atomic upsert: insert the first hit or bump the counter.
                conn.execute(
                    """INSERT INTO monthly_counts (api_key, year_month, count)
                       VALUES (?, ?, 1)
                       ON CONFLICT(api_key, year_month) DO UPDATE SET count = count + 1""",
                    (api_key, month)
                )
                conn.commit()

    def _insert_audit_log(self, record: UsageRecord) -> None:
        """Insert a single audit log (internal, synchronous)."""
        with self._get_conn() as conn:
            conn.execute(
                """INSERT INTO usage_log
                   (api_key, tier, timestamp, endpoint, request_body, response, error, processing_ms)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    record.api_key,
                    record.tier.value,
                    record.timestamp,
                    record.endpoint,
                    json.dumps(record.request_body) if record.request_body else None,
                    json.dumps(record.response) if record.response else None,
                    record.error,
                    record.processing_ms,
                )
            )
            conn.commit()

    def increment_usage_sync(self, record: UsageRecord) -> bool:
        """
        Synchronously record usage and increment counter.
        Returns True if within quota (i.e., counter was incremented), False if quota exceeded.

        NOTE(review): check-then-increment is not atomic, so concurrent
        requests can slightly overshoot the quota under load.
        """
        tier = record.tier
        limit = tier.monthly_evaluation_limit
        if limit is not None:
            remaining = self.get_remaining_quota(record.api_key, tier)
            if remaining <= 0:
                return False
        self._increment_quota(record.api_key, tier)
        self._insert_audit_log(record)
        return True

    async def increment_usage_async(self, record: UsageRecord, background_tasks: BackgroundTasks) -> bool:
        """
        Asynchronously record usage using FastAPI BackgroundTasks.
        Returns True if quota allows (i.e., will be recorded), False if quota exceeded.

        The quota check happens now; the counter increment and audit write
        are deferred until after the response is sent.
        """
        tier = record.tier
        limit = tier.monthly_evaluation_limit
        if limit is not None:
            remaining = self.get_remaining_quota(record.api_key, tier)
            if remaining <= 0:
                return False
        # Schedule the actual write in the background
        background_tasks.add_task(self._increment_quota, record.api_key, tier)
        background_tasks.add_task(self._insert_audit_log, record)
        return True

    def get_audit_logs(
        self,
        api_key: str,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        limit: int = 100,
    ) -> List[Dict[str, Any]]:
        """Retrieve audit logs for a given API key.

        Optional date bounds are inclusive; results are newest-first and
        capped at `limit` rows.
        """
        query = "SELECT * FROM usage_log WHERE api_key = ?"
        params = [api_key]
        if start_date:
            query += " AND timestamp >= ?"
            params.append(start_date.timestamp())
        if end_date:
            query += " AND timestamp <= ?"
            params.append(end_date.timestamp())
        query += " ORDER BY timestamp DESC LIMIT ?"
        params.append(limit)

        with self._get_conn() as conn:
            rows = conn.execute(query, params).fetchall()
            return [dict(row) for row in rows]

    def clean_old_logs(self):
        """Delete logs older than retention period for each tier."""
        with self._get_conn() as conn:
            for tier in Tier:
                retention_days = tier.audit_log_retention_days
                if retention_days is None:
                    # Defensive: current tiers always return an int.
                    continue
                cutoff = time.time() - retention_days * 86400
                conn.execute(
                    "DELETE FROM usage_log WHERE tier = ? AND timestamp < ?",
                    (tier.value, cutoff)
                )
            conn.commit()
293
+
294
+
295
# Global instance
# Populated by init_tracker(); stays None while usage tracking is disabled.
tracker: Optional[UsageTracker] = None


def init_tracker(db_path: str = "arf_usage.db", redis_url: Optional[str] = None):
    """Create the module-global UsageTracker.

    NOTE(review): code that did `from app.core.usage_tracker import tracker`
    before this runs keeps its own binding to None — always access the
    tracker through this module (usage_tracker.tracker) after calling this.
    """
    global tracker
    tracker = UsageTracker(db_path, redis_url)
302
+
303
+
304
+ def update_key_tier(api_key: str, new_tier: Tier) -> bool:
305
+ """Globally accessible helper to update API key tier."""
306
+ if tracker is None:
307
+ return False
308
+ return tracker.update_api_key_tier(api_key, new_tier)
309
+
310
+
311
# FastAPI dependency to enforce quota
from fastapi import HTTPException, Request

async def enforce_quota(request: Request, api_key: Optional[str] = None):
    """
    Dependency that checks API key and remaining quota.
    Use in your endpoint: `quota = Depends(enforce_quota)`

    If usage tracking is disabled, returns a default dict (no enforcement).

    Raises:
        HTTPException 401: no API key supplied.
        HTTPException 403: key unknown or inactive.
        HTTPException 429: monthly evaluation quota exhausted.
    """
    # If tracker not initialised, allow all requests (fallback)
    if tracker is None:
        return {"api_key": api_key or "disabled", "tier": Tier.FREE, "remaining": None}

    # Extract API key from "Authorization: Bearer <key>" header, falling
    # back to the ?api_key= query parameter.
    if api_key is None:
        auth_header = request.headers.get("Authorization")
        if auth_header and auth_header.startswith("Bearer "):
            api_key = auth_header[7:]  # strip the "Bearer " prefix
        else:
            api_key = request.query_params.get("api_key")

    if not api_key:
        raise HTTPException(status_code=401, detail="Missing API key")

    tier = tracker.get_tier(api_key)
    if tier is None:
        raise HTTPException(status_code=403, detail="Invalid or inactive API key")

    # remaining is None for unlimited tiers — never rate-limited.
    remaining = tracker.get_remaining_quota(api_key, tier)
    if remaining is not None and remaining <= 0:
        raise HTTPException(status_code=429, detail="Monthly evaluation quota exceeded")

    # Store in request state for later logging
    request.state.api_key = api_key
    request.state.tier = tier
    return {"api_key": api_key, "tier": tier, "remaining": remaining}
app/database/__init__.py ADDED
File without changes
app/database/base.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# declarative_base() moved to sqlalchemy.orm in SQLAlchemy 1.4; importing
# it from sqlalchemy.ext.declarative is deprecated (removed-in-2.x warning).
try:
    from sqlalchemy.orm import declarative_base
except ImportError:  # pragma: no cover - SQLAlchemy < 1.4 fallback
    from sqlalchemy.ext.declarative import declarative_base

# Shared declarative base for all ORM models in app.database.
Base = declarative_base()
app/database/models_intents.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import Column, Integer, String, DateTime, Boolean, Text, JSON, ForeignKey, UniqueConstraint
2
+ from sqlalchemy.orm import relationship
3
+ import datetime
4
+ from .base import Base
5
+
6
+
7
class IntentDB(Base):
    """Persisted infrastructure intent plus its evaluation metadata."""
    __tablename__ = "intents"
    id = Column(Integer, primary_key=True, index=True)
    # Stable content hash used for idempotent upserts (see intent_store).
    deterministic_id = Column(String(64), unique=True, index=True, nullable=False)
    intent_type = Column(String(64), nullable=False)
    payload = Column(JSON, nullable=False)  # raw API request payload
    oss_payload = Column(JSON, nullable=True)  # serialized OSS intent, set once evaluated
    environment = Column(String(32), nullable=True)
    created_at = Column(DateTime, default=datetime.datetime.utcnow, nullable=False)
    evaluated_at = Column(DateTime, nullable=True)
    # NOTE(review): risk score is stored as text (str(float) in
    # intent_store.save_evaluated_intent); numeric queries need a cast.
    risk_score = Column(String(32), nullable=True)
    # Outcomes are deleted with their intent (cascade delete-orphan).
    outcomes = relationship("OutcomeDB", back_populates="intent", cascade="all, delete-orphan")
19
+
20
+
21
class OutcomeDB(Base):
    """Recorded real-world outcome (success/failure) of an evaluated intent."""
    __tablename__ = "intent_outcomes"
    id = Column(Integer, primary_key=True, index=True)
    intent_id = Column(Integer, ForeignKey("intents.id", ondelete="CASCADE"), nullable=False)
    success = Column(Boolean, nullable=False)
    recorded_by = Column(String(128), nullable=True)  # who reported the outcome
    notes = Column(Text, nullable=True)
    recorded_at = Column(DateTime, default=datetime.datetime.utcnow, nullable=False)
    intent = relationship("IntentDB", back_populates="outcomes")

    # At most one outcome per intent (enforced at the DB level; mirrored by
    # the application check in outcome_service.record_outcome).
    __table_args__ = (
        UniqueConstraint("intent_id", name="uq_outcome_intentid"),
    )
app/database/session.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from app.core.config import settings

# Fail fast with an actionable message: settings.database_url is Optional
# and defaults to None, and create_engine(None) raises an opaque error.
if not settings.database_url:
    raise RuntimeError(
        "database_url is not configured; set DATABASE_URL in the "
        "environment (or .env) before starting the application."
    )

# Engine and session factory shared by the whole app (see app.api.deps.get_db).
engine = create_engine(settings.database_url)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
app/main.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ARF API Control Plane - Main Application Entry Point
3
+ With optional heavy dependencies and usage tracking.
4
+ """
5
+ import logging
6
+ import os
7
+ import sys
8
+ import json
9
+ from contextlib import asynccontextmanager
10
+ from typing import Dict
11
+
12
+ from fastapi import FastAPI
13
+ from fastapi.middleware.cors import CORSMiddleware
14
+
15
+ # Optional prometheus
16
+ try:
17
+ from prometheus_fastapi_instrumentator import Instrumentator
18
+ PROMETHEUS_AVAILABLE = True
19
+ except ImportError:
20
+ PROMETHEUS_AVAILABLE = False
21
+ Instrumentator = None
22
+
23
+ # Optional slowapi
24
+ try:
25
+ from slowapi import _rate_limit_exceeded_handler
26
+ from slowapi.errors import RateLimitExceeded
27
+ from slowapi.middleware import SlowAPIMiddleware
28
+ SLOWAPI_AVAILABLE = True
29
+ except ImportError:
30
+ SLOWAPI_AVAILABLE = False
31
+ _rate_limit_exceeded_handler = None
32
+ RateLimitExceeded = None
33
+ SlowAPIMiddleware = None
34
+
35
+ # Optional agentic_reliability_framework (risk engine, policy engine, etc.)
36
+ try:
37
+ from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
38
+ from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
39
+ from agentic_reliability_framework.runtime.memory import create_faiss_index, RAGGraphMemory
40
+ from agentic_reliability_framework.runtime.memory.constants import MemoryConstants
41
+ ARF_AVAILABLE = True
42
+ except ImportError:
43
+ ARF_AVAILABLE = False
44
+ RiskEngine = None
45
+ PolicyEngine = None
46
+ create_faiss_index = None
47
+ RAGGraphMemory = None
48
+ MemoryConstants = None
49
+
50
+ # ===== USAGE TRACKER =====
51
+ from app.core.usage_tracker import init_tracker, tracker, Tier
52
+
53
+ from app.api import (
54
+ routes_governance,
55
+ routes_history,
56
+ routes_incidents,
57
+ routes_intents,
58
+ routes_risk,
59
+ routes_memory,
60
+ routes_admin,
61
+ routes_payments,
62
+ webhooks,
63
+ routes_users, # <-- ADDED
64
+ )
65
+ from app.api.deps import limiter
66
+ from app.core.config import settings
67
+
68
+ logger = logging.getLogger("arf.api")
69
+ logging.basicConfig(
70
+ level=logging.INFO,
71
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
72
+ handlers=[logging.StreamHandler(sys.stdout)],
73
+ )
74
+
75
+
76
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application startup/shutdown hook.

    On startup: initialises the ARF engines (risk, policy, RAG memory), the
    optional epistemic embedding model, and the usage tracker, all attached
    to app.state. On shutdown: logs only (no resources to release).
    """
    logger.info("🚀 Starting ARF API Control Plane")
    logger.debug(f"Python path: {sys.path}")

    if ARF_AVAILABLE:
        hmc_model_path = os.getenv("ARF_HMC_MODEL", "models/hmc_model.json")
        use_hyperpriors = os.getenv(
            "ARF_USE_HYPERPRIORS",
            "false").lower() == "true"
        logger.info(
            "Initializing RiskEngine – HMC model: %s, hyperpriors: %s",
            hmc_model_path,
            use_hyperpriors)
        # RiskEngine is mandatory when ARF is installed: a failure aborts startup.
        try:
            app.state.risk_engine = RiskEngine(
                hmc_model_path=hmc_model_path,
                use_hyperpriors=use_hyperpriors,
                n0=1000,
                hyperprior_weight=0.3,
            )
            logger.info("✅ RiskEngine initialized successfully.")
        except Exception as e:
            logger.exception("💥 Fatal error initializing RiskEngine")
            raise RuntimeError("RiskEngine initialization failed") from e

        # PolicyEngine and RAG memory are best-effort: the API degrades
        # gracefully (state set to None) if they fail to come up.
        try:
            app.state.policy_engine = PolicyEngine()
            logger.info("✅ PolicyEngine initialized successfully.")
        except Exception as e:
            logger.warning(f"PolicyEngine initialization failed: {e}")
            app.state.policy_engine = None

        try:
            faiss_index = create_faiss_index(dim=MemoryConstants.VECTOR_DIM)
            app.state.rag_graph = RAGGraphMemory(faiss_index)
            logger.info("✅ RAGGraphMemory initialized successfully.")
        except Exception as e:
            logger.warning(f"RAGGraphMemory initialization failed: {e}")
            app.state.rag_graph = None

        # Optional sentence-transformers model for epistemic signals.
        epistemic_model_name = os.getenv("EPISTEMIC_MODEL", "")
        if epistemic_model_name:
            try:
                from sentence_transformers import SentenceTransformer
                logger.info(f"Loading epistemic model: {epistemic_model_name}")
                app.state.epistemic_model = SentenceTransformer(
                    epistemic_model_name)
                app.state.epistemic_tokenizer = app.state.epistemic_model.tokenizer
                logger.info("✅ Epistemic model loaded.")
            except ImportError:
                logger.warning(
                    "sentence-transformers not installed; epistemic signals will be zeros.")
                app.state.epistemic_model = None
                app.state.epistemic_tokenizer = None
            except Exception as e:
                logger.warning(f"Failed to load epistemic model: {e}")
                app.state.epistemic_model = None
                app.state.epistemic_tokenizer = None
        else:
            logger.info(
                "EPISTEMIC_MODEL not set; epistemic signals will be zeros.")
            app.state.epistemic_model = None
            app.state.epistemic_tokenizer = None
    else:
        logger.warning(
            "agentic_reliability_framework not installed; risk engine, policy engine, RAG disabled.")

    # ===== USAGE TRACKER INITIALISATION =====
    if os.getenv("ARF_USAGE_TRACKING", "false").lower() == "true":
        logger.info("Initialising usage tracker...")
        init_tracker(
            db_path=os.getenv("ARF_USAGE_DB_PATH", "arf_usage.db"),
            redis_url=os.getenv("ARF_REDIS_URL")
        )
        # BUG FIX: `from app.core.usage_tracker import tracker` binds the
        # value (None) at import time, so the module-level name never sees
        # the instance created by init_tracker(). Re-read the attribute
        # from the module instead of using the stale import.
        from app.core import usage_tracker as usage_tracker_module
        active_tracker = usage_tracker_module.tracker
        # Seed initial API keys from environment variable (for testing / demo)
        api_keys_json = os.getenv("ARF_API_KEYS", "{}")
        try:
            api_keys = json.loads(api_keys_json)
            for key, tier_str in api_keys.items():
                try:
                    tier = Tier(tier_str.lower())
                    active_tracker.get_or_create_api_key(key, tier)
                    logger.info(f"Seeded API key for tier {tier.value}")
                except ValueError:
                    logger.warning(f"Invalid tier '{tier_str}' for key {key}, skipping")
        except json.JSONDecodeError:
            logger.warning("ARF_API_KEYS environment variable is not valid JSON; skipping seeding.")
        app.state.usage_tracker = active_tracker
        logger.info("✅ Usage tracker ready.")
    else:
        logger.info("Usage tracking disabled (ARF_USAGE_TRACKING not set to true).")
        app.state.usage_tracker = None

    yield
    logger.info("🛑 Shutting down ARF API")
172
+
173
+
174
def create_app() -> FastAPI:
    """Build and configure the FastAPI application.

    Wires CORS, optional rate limiting (slowapi) and Prometheus metrics,
    then mounts all API routers. Engine/tracker initialisation happens in
    `lifespan`, not here.
    """
    app = FastAPI(
        title=settings.app_name,
        version="0.5.0",
        lifespan=lifespan,
        docs_url="/docs",
        redoc_url="/redoc",
        description="Agentic Reliability Framework (ARF) API",
    )

    # Only the known frontend origin may make credentialed browser requests.
    allowed_origins = ["https://arf-frontend-sandy.vercel.app"]
    app.add_middleware(
        CORSMiddleware,
        allow_origins=allowed_origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    logger.debug("CORS middleware configured")

    # Rate limiting is active only when slowapi is importable.
    if SLOWAPI_AVAILABLE:
        app.state.limiter = limiter
        app.add_exception_handler(
            RateLimitExceeded,
            _rate_limit_exceeded_handler)
        app.add_middleware(SlowAPIMiddleware)
        logger.debug("Rate limiter middleware configured")
    else:
        logger.debug("Rate limiter disabled (slowapi not installed)")

    # Metrics endpoint is exposed only when the instrumentator is installed.
    if PROMETHEUS_AVAILABLE:
        Instrumentator().instrument(app).expose(app)
        logger.debug("Prometheus instrumentator configured")
    else:
        logger.debug(
            "Prometheus instrumentator disabled (module not installed)")

    # Include routers
    # NOTE: most routers live under /api/v1; memory uses /v1/memory and
    # webhooks are mounted at the application root.
    app.include_router(
        routes_incidents.router,
        prefix="/api/v1",
        tags=["incidents"])
    app.include_router(routes_risk.router, prefix="/api/v1", tags=["risk"])
    app.include_router(
        routes_intents.router,
        prefix="/api/v1",
        tags=["intents"])
    app.include_router(
        routes_history.router,
        prefix="/api/v1",
        tags=["history"])
    app.include_router(
        routes_governance.router,
        prefix="/api/v1",
        tags=["governance"])
    app.include_router(
        routes_memory.router,
        prefix="/v1/memory",
        tags=["memory"])
    app.include_router(
        routes_admin.router,
        prefix="/api/v1",
        tags=["admin"])
    app.include_router(
        routes_payments.router,
        prefix="/api/v1",
        tags=["payments"])
    app.include_router(
        webhooks.router,
        tags=["webhooks"])
    app.include_router(
        routes_users.router,  # <-- ADDED
        prefix="/api/v1",
        tags=["users"])
    logger.debug("All API routers included")

    @app.get("/health", tags=["health"])
    async def health() -> Dict[str, str]:
        # Liveness probe for load balancers / orchestrators.
        return {"status": "ok"}

    return app
255
+
256
+
257
+ app = create_app()
app/services/__init__.py ADDED
File without changes
app/services/incident_service.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from agentic_reliability_framework.core.reliability_signal import signal_to_reliability
2
+ from app.models.incident_models import IncidentReport
3
+
4
+
5
def process_incident(report: IncidentReport) -> float:
    """Convert an incident report's raw signal value into a reliability score."""
    return signal_to_reliability(report.value, signal_type=report.signal_type)
app/services/intent_adapter.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from agentic_reliability_framework.core.governance.intents import (
2
+ ProvisionResourceIntent,
3
+ GrantAccessIntent,
4
+ DeployConfigurationIntent,
5
+ )
6
+
7
+
8
def to_oss_intent(api_request):
    """Map an API-layer intent request onto the matching OSS intent object.

    Dispatches on `api_request.intent_type`; raises ValueError for types
    this adapter does not understand.
    """
    kind = api_request.intent_type
    if kind == "provision_resource":
        return ProvisionResourceIntent(
            resource_type=api_request.resource_type,
            region=api_request.region,
            size=api_request.size,
            configuration=api_request.configuration,
            environment=api_request.environment,
            requester=api_request.requester,
            provenance=api_request.provenance,
        )
    if kind == "grant_access":
        return GrantAccessIntent(
            principal=api_request.principal,
            permission_level=api_request.permission_level,
            resource_scope=api_request.resource_scope,
            justification=api_request.justification,
            requester=api_request.requester,
            provenance=api_request.provenance,
        )
    if kind == "deploy_config":
        return DeployConfigurationIntent(
            service_name=api_request.service_name,
            change_scope=api_request.change_scope,
            deployment_target=api_request.deployment_target,
            risk_level_hint=api_request.risk_level_hint,
            configuration=api_request.configuration,
            requester=api_request.requester,
            provenance=api_request.provenance,
        )
    raise ValueError(f"Unknown intent type: {api_request.intent_type}")
app/services/intent_service.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import logging
3
+ from app.models.intent_models import IntentSimulation
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+
8
# Note: This endpoint is deprecated. Use /v1/intents/evaluate instead.
def simulate_intent(intent: IntentSimulation) -> dict:
    """Deprecated random-risk simulation, kept for backward compatibility.

    The `intent` argument is ignored; the score is drawn uniformly and the
    recommendation derives from fixed thresholds (0.2 / 0.6).
    """
    logger.warning("Deprecated endpoint /simulate_intent used. Please migrate to /v1/intents/evaluate.")
    # For backward compatibility, we still use random risk.
    risk_score = random.uniform(0, 1)
    if risk_score >= 0.6:
        recommendation = "blocked"
    elif risk_score >= 0.2:
        recommendation = "requires_approval"
    else:
        recommendation = "safe_to_execute"
    return {"risk_score": risk_score, "recommendation": recommendation}
app/services/intent_store.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ from sqlalchemy.orm import Session
3
+ from app.database.models_intents import IntentDB
4
+ from typing import Any, Dict, Optional
5
+
6
+
7
def save_evaluated_intent(
    db: Session,
    deterministic_id: str,
    intent_type: str,
    api_payload: Dict[str, Any],
    oss_payload: Dict[str, Any],
    environment: str,
    risk_score: float
) -> IntentDB:
    """Upsert an evaluated intent keyed by its deterministic id.

    Re-evaluating a known intent refreshes evaluated_at, risk_score and
    oss_payload in place; otherwise a new row is inserted. Commits and
    refreshes before returning the persisted row.
    """
    existing = db.query(IntentDB).filter(IntentDB.deterministic_id == deterministic_id).one_or_none()
    if existing:
        existing.evaluated_at = datetime.datetime.utcnow()
        # risk_score column is String(32): stored as the float's str().
        existing.risk_score = str(risk_score)
        existing.oss_payload = oss_payload
        db.add(existing)
        db.commit()
        db.refresh(existing)
        return existing

    intent = IntentDB(
        deterministic_id=deterministic_id,
        intent_type=intent_type,
        payload=api_payload,
        oss_payload=oss_payload,
        environment=environment,
        evaluated_at=datetime.datetime.utcnow(),
        risk_score=str(risk_score)
    )
    db.add(intent)
    db.commit()
    db.refresh(intent)
    return intent
39
+
40
+
41
def get_intent_by_deterministic_id(db: Session, deterministic_id: str) -> Optional[IntentDB]:
    """Look up an intent row by its deterministic id; None when absent."""
    matching = db.query(IntentDB).filter(IntentDB.deterministic_id == deterministic_id)
    return matching.one_or_none()
app/services/outcome_service.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import logging
3
+ from typing import Optional, Dict, Any
4
+
5
+ from sqlalchemy.orm import Session
6
+
7
+ from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
8
+ from agentic_reliability_framework.core.governance.intents import (
9
+ InfrastructureIntent,
10
+ ProvisionResourceIntent,
11
+ GrantAccessIntent,
12
+ DeployConfigurationIntent,
13
+ )
14
+ from app.database.models_intents import IntentDB, OutcomeDB
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class OutcomeConflictError(Exception):
    """Raised when an outcome is re-recorded with a contradictory result."""
    pass
21
+
22
+
23
def reconstruct_oss_intent_from_json(oss_json: Dict[str, Any]) -> InfrastructureIntent:
    """Rebuild a typed OSS intent from its stored JSON payload.

    The payload's "intent_type" discriminator selects the concrete class;
    an unknown or missing discriminator raises ValueError.
    """
    intent_type = oss_json.get("intent_type")
    if intent_type == "provision_resource":
        return ProvisionResourceIntent(**oss_json)
    if intent_type == "grant_access":
        return GrantAccessIntent(**oss_json)
    if intent_type == "deploy_config":
        return DeployConfigurationIntent(**oss_json)
    raise ValueError(
        f"Cannot reconstruct intent from JSON: missing or unknown intent_type {intent_type}"
    )
35
+
36
+
37
def _create_dummy_intent(intent_type: str) -> Optional[InfrastructureIntent]:
    """Create a valid dummy intent for a given intent type.

    Only "ProvisionResourceIntent" is supported so far; other types log a
    warning and yield None so callers can skip the risk-engine update.
    """
    from agentic_reliability_framework.core.governance.intents import (
        ProvisionResourceIntent,
    )
    if intent_type != "ProvisionResourceIntent":
        logger.warning("Dummy intent creation not implemented for %s", intent_type)
        return None
    # Plain string field values; they must satisfy the model's validators.
    return ProvisionResourceIntent(
        resource_type="vm",
        region="eastus",
        size="Standard_D2s_v3",
        environment="dev",  # string form instead of Environment.dev
        requester="system"
    )
56
+
57
+
58
def record_outcome(
    db: Session,
    deterministic_id: str,
    success: bool,
    recorded_by: Optional[str],
    notes: Optional[str],
    risk_engine: RiskEngine
) -> OutcomeDB:
    """Persist the real-world outcome of an evaluated intent and feed it
    back into the Bayesian risk engine.

    Idempotent: recording the same result again returns the existing row.

    Raises:
        ValueError: no intent exists for `deterministic_id`.
        OutcomeConflictError: an outcome with the opposite result exists.
    """
    intent = db.query(IntentDB).filter(IntentDB.deterministic_id == deterministic_id).one_or_none()
    if not intent:
        raise ValueError(f"Intent not found: {deterministic_id}")

    # Enforce one outcome per intent (mirrors the DB unique constraint).
    existing_outcome = db.query(OutcomeDB).filter(OutcomeDB.intent_id == intent.id).one_or_none()
    if existing_outcome:
        if existing_outcome.success == success:
            # Idempotent replay of the same result.
            return existing_outcome
        raise OutcomeConflictError("Outcome already recorded with different result")

    outcome = OutcomeDB(
        intent_id=intent.id,
        success=bool(success),
        recorded_by=recorded_by,
        notes=notes,
        recorded_at=datetime.datetime.utcnow()  # will be replaced with timezone-aware later
    )
    db.add(outcome)
    db.commit()
    db.refresh(outcome)

    # Determine OSS intent for risk engine update. A dummy intent is used
    # when the stored payload is absent or cannot be deserialized, so the
    # engine still receives the outcome where possible.
    oss_intent = None
    if intent.oss_payload:
        try:
            oss_intent = reconstruct_oss_intent_from_json(intent.oss_payload)
        except Exception as e:
            logger.warning(
                "Failed to reconstruct OSS intent for %s: %s. Using dummy fallback.",
                deterministic_id, e
            )
            oss_intent = _create_dummy_intent(intent.intent_type)
    else:
        oss_intent = _create_dummy_intent(intent.intent_type)

    # Update risk engine if we have an intent. The outcome row is already
    # committed above, so engine failures never roll back the record.
    if oss_intent is not None:
        try:
            risk_engine.update_outcome(oss_intent, success)
        except Exception as e:
            logger.exception(
                "Failed to update RiskEngine after recording outcome for intent %s: %s",
                deterministic_id, e
            )
    else:
        logger.error(
            "No valid OSS intent available for risk engine update; skipping outcome for %s",
            deterministic_id
        )

    return outcome
app/services/risk_service.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
2
+ from agentic_reliability_framework.core.governance.intents import InfrastructureIntent
3
+ from typing import Optional, List, Dict, Any
4
+
5
+ from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction
6
+ from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
7
+ from agentic_reliability_framework.core.decision.decision_engine import DecisionEngine
8
+ from agentic_reliability_framework.runtime.memory.rag_graph import RAGGraphMemory
9
+
10
+ # NEW: Import eclipse probe
11
+ from agentic_reliability_framework.core.research.eclipse_probe import compute_epistemic_risk
12
+
13
+
14
def evaluate_intent(
    engine: RiskEngine,
    intent: InfrastructureIntent,
    cost_estimate: Optional[float],
    policy_violations: List[str]
) -> dict:
    """Score an infrastructure intent with the Bayesian risk engine.

    Delegates to ``engine.calculate_risk`` and repackages its three-part
    result (score, explanation, per-factor contributions) into a response
    dictionary.

    Args:
        engine: Risk engine performing the calculation.
        intent: The infrastructure intent to evaluate.
        cost_estimate: Optional estimated cost of the intent.
        policy_violations: Names of policies the intent violates.

    Returns:
        Dict with keys ``risk_score``, ``explanation`` and ``contributions``.
    """
    risk_result = engine.calculate_risk(
        intent=intent,
        cost_estimate=cost_estimate,
        policy_violations=policy_violations,
    )
    response_keys = ("risk_score", "explanation", "contributions")
    return dict(zip(response_keys, risk_result))
34
+
35
+
36
def evaluate_healing_decision(
    event: ReliabilityEvent,
    policy_engine: PolicyEngine,
    decision_engine: Optional[DecisionEngine] = None,
    rag_graph: Optional[RAGGraphMemory] = None,
    model=None,  # optional HuggingFace-style model for the eclipse probe
    tokenizer=None,  # optional matching tokenizer
) -> Dict[str, Any]:
    """
    Evaluate healing actions for a given reliability event using decision‑theoretic selection.
    Now includes epistemic risk signals from the eclipse probe.

    Flow: gather raw candidate actions from the policy engine (with its own
    decision engine temporarily disabled), build reasoning/evidence text,
    compute epistemic signals (zeros when no model/tokenizer is supplied),
    then let the decision engine pick the optimal action.

    Returns:
        Dictionary with keys: risk_score, selected_action, expected_utility, alternatives
        (top 3), explanation, raw_decision, epistemic_signals.
    """
    # Resolve a decision engine: prefer the explicit argument, then one
    # attached to the policy engine, else build a minimal fallback.
    if decision_engine is None and hasattr(policy_engine, 'decision_engine'):
        decision_engine = policy_engine.decision_engine

    if decision_engine is None:
        decision_engine = DecisionEngine(rag_graph=rag_graph)

    # Get raw candidate actions by temporarily disabling the policy engine's
    # own decision logic; the flag is restored in `finally` even on error.
    orig_use = policy_engine.use_decision_engine
    try:
        policy_engine.use_decision_engine = False
        raw_actions = policy_engine.evaluate_policies(event)
    finally:
        policy_engine.use_decision_engine = orig_use

    # Short-circuit: nothing triggered (or only the explicit NO_ACTION).
    if not raw_actions or raw_actions == [HealingAction.NO_ACTION]:
        return {
            "risk_score": 0.0,
            "selected_action": HealingAction.NO_ACTION.value,
            "expected_utility": 0.0,
            "alternatives": [],
            "explanation": "No candidate actions triggered.",
            "epistemic_signals": None,
        }

    # Build reasoning text from every policy whose action set overlaps the
    # raw candidates (NOTE(review): this may include policies that did not
    # actually fire but share an action — confirm intended).
    reasoning_parts = []
    for policy in policy_engine.policies:
        # Check if any of the policy's actions are in raw_actions
        if any(a in policy.actions for a in raw_actions):
            conditions_str = ", ".join(
                f"{c.metric} {c.operator} {c.threshold}" for c in policy.conditions
            )
            reasoning_parts.append(
                f"Policy {policy.name} triggered by {conditions_str} → actions {[a.value for a in policy.actions]}"
            )
    reasoning_text = " ".join(reasoning_parts)

    # Evidence text summarizes the event's observed metrics.
    evidence_text = (
        f"Component: {event.component}, "
        f"latency_p99: {event.latency_p99}, "
        f"error_rate: {event.error_rate}, "
        f"cpu_util: {event.cpu_util}, "
        f"memory_util: {event.memory_util}"
    )

    # Compute epistemic signals via the eclipse probe when a model and
    # tokenizer are available; otherwise fall back to all-zero signals so
    # downstream consumers always see the same dict shape.
    epistemic_signals = None
    if model is not None and tokenizer is not None:
        epistemic_signals = compute_epistemic_risk(
            reasoning_text, evidence_text, model, tokenizer
        )
    else:
        # In OSS, we may not have a model; use zeros as fallback
        epistemic_signals = {
            "entropy": 0.0,
            "contradiction": 0.0,
            "evidence_lift": 0.0,
            "hallucination_risk": 0.0,
        }

    # Run decision engine to get best action and alternatives, passing epistemic signals
    decision = decision_engine.select_optimal_action(
        raw_actions, event, component=event.component,
        epistemic_signals=epistemic_signals
    )

    # Risk of the selected action: read it from the alternatives list if
    # present, otherwise compute it separately.
    risk_score = None
    for alt in decision.alternatives:
        if alt.action == decision.best_action:
            risk_score = alt.risk
            break
    if risk_score is None:
        # Compute risk separately
        risk_score = decision_engine.compute_risk(decision.best_action, event, event.component)

    # Format alternatives (top 3 only)
    alt_list = []
    for alt in decision.alternatives[:3]:
        alt_list.append({
            "action": alt.action.value,
            "expected_utility": alt.utility,
            "risk": alt.risk,
        })

    # Build final response
    response = {
        "risk_score": risk_score,
        "selected_action": decision.best_action.value,
        "expected_utility": decision.expected_utility,
        "alternatives": alt_list,
        "explanation": decision.explanation,
        "raw_decision": decision.raw_data,
        "epistemic_signals": epistemic_signals,
    }
    return response
153
+
154
+
155
def get_system_risk() -> float:
    """Return a placeholder system-wide risk score in [0, 1].

    Deprecated: this endpoint is being phased out and is kept only for
    backward compatibility. The value is random, not a real measurement.
    """
    import random

    score = random.uniform(0, 1)
    return round(score, 2)