Add FastAPI app
Browse files- app/__init__.py +0 -0
- app/api/__init__.py +0 -0
- app/api/deps.py +104 -0
- app/api/routes_admin.py +123 -0
- app/api/routes_governance.py +189 -0
- app/api/routes_history.py +9 -0
- app/api/routes_incidents.py +131 -0
- app/api/routes_intents.py +14 -0
- app/api/routes_memory.py +24 -0
- app/api/routes_payments.py +54 -0
- app/api/routes_risk.py +40 -0
- app/api/routes_users.py +56 -0
- app/api/webhooks.py +48 -0
- app/causal_explainer.py +164 -0
- app/core/__init__.py +0 -0
- app/core/config.py +23 -0
- app/core/storage.py +2 -0
- app/core/usage_tracker.py +347 -0
- app/database/__init__.py +0 -0
- app/database/base.py +2 -0
- app/database/models_intents.py +33 -0
- app/database/session.py +6 -0
- app/main.py +257 -0
- app/services/__init__.py +0 -0
- app/services/incident_service.py +7 -0
- app/services/intent_adapter.py +39 -0
- app/services/intent_service.py +19 -0
- app/services/intent_store.py +42 -0
- app/services/outcome_service.py +116 -0
- app/services/risk_service.py +158 -0
app/__init__.py
ADDED
|
File without changes
|
app/api/__init__.py
ADDED
|
File without changes
|
app/api/deps.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
from app.database.session import SessionLocal
|
| 3 |
+
from slowapi import Limiter
|
| 4 |
+
from slowapi.util import get_remote_address
|
| 5 |
+
from app.core.config import settings
|
| 6 |
+
|
| 7 |
+
# ARF core engine imports
|
| 8 |
+
from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
|
| 9 |
+
from agentic_reliability_framework.core.decision.decision_engine import DecisionEngine
|
| 10 |
+
from agentic_reliability_framework.core.governance.stability_controller import LyapunovStabilityController
|
| 11 |
+
from agentic_reliability_framework.core.governance.causal_explainer import CausalExplainer
|
| 12 |
+
from agentic_reliability_framework.runtime.memory.rag_graph import RAGGraphMemory
|
| 13 |
+
from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# Dependency to get DB session
|
| 17 |
+
def get_db():
    """Yield a database session scoped to a single request.

    FastAPI dependency: the session is created per request and is always
    closed afterwards, even if the request handler raises.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# Rate limiter with default limit from settings
|
| 26 |
+
limiter = Limiter(key_func=get_remote_address, default_limits=[settings.RATE_LIMIT])
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# ARF engine dependencies (singletons for simplicity)
|
| 30 |
+
_risk_engine = None
|
| 31 |
+
_decision_engine = None
|
| 32 |
+
_stability_controller = None
|
| 33 |
+
_causal_explainer = None
|
| 34 |
+
_rag_graph = None
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _seed_rag_graph(rag):
    """Seed the RAG graph with historical healing action outcomes."""
    # (incident_id, component, action, success, resolution_time_minutes)
    historical_outcomes = [
        ("seed_restart_1", "test", HealingAction.RESTART_CONTAINER.value, True, 2),
        ("seed_restart_2", "test", HealingAction.RESTART_CONTAINER.value, True, 3),
        ("seed_restart_3", "test", HealingAction.RESTART_CONTAINER.value, False, 10),
        ("seed_rollback_1", "test", HealingAction.ROLLBACK.value, True, 1),
        ("seed_rollback_2", "test", HealingAction.ROLLBACK.value, True, 2),
        ("seed_rollback_3", "test", HealingAction.ROLLBACK.value, False, 5),
        ("seed_scale_1", "test", HealingAction.SCALE_OUT.value, True, 5),
        ("seed_scale_2", "test", HealingAction.SCALE_OUT.value, False, 15),
        ("seed_cb_1", "test", HealingAction.CIRCUIT_BREAKER.value, True, 1),
        ("seed_cb_2", "test", HealingAction.CIRCUIT_BREAKER.value, True, 2),
        ("seed_ts_1", "test", HealingAction.TRAFFIC_SHIFT.value, True, 4),
        ("seed_ts_2", "test", HealingAction.TRAFFIC_SHIFT.value, False, 8),
    ]
    for incident_id, component, action, succeeded, minutes in historical_outcomes:
        # Every seed entry shares the same synthetic event metrics; only the
        # outcome (action/success/resolution time) differs.
        rag.record_outcome(
            incident_id=incident_id,
            event=ReliabilityEvent(
                component=component,
                latency_p99=500,
                error_rate=0.1,
                service_mesh="default"
            ),
            action_taken=action,
            success=succeeded,
            resolution_time_minutes=minutes
        )
    print("Seeded RAG graph with historical data", file=sys.stderr)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def get_rag_graph():
    """Return the process-wide RAGGraphMemory, creating and seeding it lazily."""
    global _rag_graph
    if _rag_graph is None:
        graph = RAGGraphMemory()
        _seed_rag_graph(graph)
        _rag_graph = graph
    return _rag_graph
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def get_decision_engine():
    """Return the singleton DecisionEngine, wired to the shared RAG graph."""
    global _decision_engine
    if _decision_engine is None:
        _decision_engine = DecisionEngine(rag_graph=get_rag_graph())
    return _decision_engine
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def get_risk_engine():
    """Return the singleton RiskEngine, constructing it on first use."""
    global _risk_engine
    if _risk_engine is None:
        _risk_engine = RiskEngine()
    return _risk_engine
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def get_stability_controller():
    """Return the singleton LyapunovStabilityController (lazy-initialized)."""
    global _stability_controller
    if _stability_controller is None:
        _stability_controller = LyapunovStabilityController()
    return _stability_controller
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def get_causal_explainer():
    """Return the singleton CausalExplainer (lazy-initialized)."""
    global _causal_explainer
    if _causal_explainer is None:
        _causal_explainer = CausalExplainer()
    return _causal_explainer
|
app/api/routes_admin.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Admin API endpoints for API key management and audit logs.
|
| 3 |
+
These endpoints should be protected (e.g., by an admin API key) in production.
|
| 4 |
+
"""
|
| 5 |
+
from fastapi import APIRouter, Depends, HTTPException, Query, Path, Body
|
| 6 |
+
from pydantic import BaseModel
|
| 7 |
+
from typing import Optional, List, Dict, Any
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
import uuid
|
| 10 |
+
|
| 11 |
+
from app.core.usage_tracker import tracker, Tier
|
| 12 |
+
|
| 13 |
+
router = APIRouter(prefix="/admin", tags=["admin"])
|
| 14 |
+
|
| 15 |
+
# Simple in‑memory admin key (replace with proper auth in production)
|
| 16 |
+
ADMIN_API_KEY = "admin_secret_change_me"
|
| 17 |
+
|
| 18 |
+
def verify_admin(admin_key: str = Query(..., alias="admin_key")):
    """FastAPI dependency: validate the ``admin_key`` query parameter.

    Fix: the original compared the secret with ``!=``, which short-circuits on
    the first differing byte and leaks key prefixes via timing; use a
    constant-time comparison instead.

    NOTE(review): secrets in query strings end up in access logs and browser
    history — consider moving this to a request header in a follow-up.

    Raises:
        HTTPException: 403 if the supplied key does not match.
    """
    import hmac  # local import keeps this security fix self-contained

    if not hmac.compare_digest(admin_key, ADMIN_API_KEY):
        raise HTTPException(status_code=403, detail="Invalid admin key")
    return True
|
| 22 |
+
|
| 23 |
+
class CreateKeyRequest(BaseModel):
|
| 24 |
+
tier: str
|
| 25 |
+
|
| 26 |
+
class UpdateTierRequest(BaseModel):
|
| 27 |
+
tier: str
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@router.post("/keys", dependencies=[Depends(verify_admin)])
|
| 31 |
+
async def create_api_key(req: CreateKeyRequest):
|
| 32 |
+
if req.tier not in [t.value for t in Tier]:
|
| 33 |
+
raise HTTPException(status_code=400, detail=f"Invalid tier. Must be one of {[t.value for t in Tier]}")
|
| 34 |
+
new_key = f"sk_live_{uuid.uuid4().hex[:24]}"
|
| 35 |
+
tier_enum = Tier(req.tier)
|
| 36 |
+
tracker.get_or_create_api_key(new_key, tier_enum)
|
| 37 |
+
return {"api_key": new_key, "tier": req.tier}
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@router.get("/keys", dependencies=[Depends(verify_admin)])
|
| 41 |
+
async def list_api_keys(limit: int = 100, offset: int = 0):
|
| 42 |
+
with tracker._get_conn() as conn:
|
| 43 |
+
rows = conn.execute(
|
| 44 |
+
"SELECT key, tier, created_at, last_used_at, is_active FROM api_keys ORDER BY created_at DESC LIMIT ? OFFSET ?",
|
| 45 |
+
(limit, offset)
|
| 46 |
+
).fetchall()
|
| 47 |
+
keys = []
|
| 48 |
+
for row in rows:
|
| 49 |
+
month = tracker._get_month_key()
|
| 50 |
+
usage_row = conn.execute(
|
| 51 |
+
"SELECT count FROM monthly_counts WHERE api_key = ? AND year_month = ?",
|
| 52 |
+
(row["key"], month)
|
| 53 |
+
).fetchone()
|
| 54 |
+
usage = usage_row["count"] if usage_row else 0
|
| 55 |
+
keys.append({
|
| 56 |
+
"key": row["key"],
|
| 57 |
+
"tier": row["tier"],
|
| 58 |
+
"created_at": datetime.fromtimestamp(row["created_at"]).isoformat(),
|
| 59 |
+
"last_used_at": datetime.fromtimestamp(row["last_used_at"]).isoformat() if row["last_used_at"] else None,
|
| 60 |
+
"is_active": bool(row["is_active"]),
|
| 61 |
+
"current_month_usage": usage,
|
| 62 |
+
})
|
| 63 |
+
return {"keys": keys, "total": len(keys)}
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@router.patch("/keys/{api_key}/tier", dependencies=[Depends(verify_admin)])
|
| 67 |
+
async def update_key_tier(
|
| 68 |
+
api_key: str = Path(..., description="The API key to update"),
|
| 69 |
+
req: UpdateTierRequest = Body(...),
|
| 70 |
+
):
|
| 71 |
+
if req.tier not in [t.value for t in Tier]:
|
| 72 |
+
raise HTTPException(status_code=400, detail=f"Invalid tier. Must be one of {[t.value for t in Tier]}")
|
| 73 |
+
with tracker._get_conn() as conn:
|
| 74 |
+
row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
|
| 75 |
+
if not row:
|
| 76 |
+
raise HTTPException(status_code=404, detail="API key not found")
|
| 77 |
+
conn.execute("UPDATE api_keys SET tier = ? WHERE key = ?", (req.tier, api_key))
|
| 78 |
+
conn.commit()
|
| 79 |
+
return {"message": f"Tier updated to {req.tier}"}
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
@router.delete("/keys/{api_key}", dependencies=[Depends(verify_admin)])
|
| 83 |
+
async def deactivate_api_key(api_key: str = Path(..., description="The API key to deactivate")):
|
| 84 |
+
with tracker._get_conn() as conn:
|
| 85 |
+
row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
|
| 86 |
+
if not row:
|
| 87 |
+
raise HTTPException(status_code=404, detail="API key not found")
|
| 88 |
+
conn.execute("UPDATE api_keys SET is_active = 0 WHERE key = ?", (api_key,))
|
| 89 |
+
conn.commit()
|
| 90 |
+
return {"message": "API key deactivated"}
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@router.get("/audit/{api_key}", dependencies=[Depends(verify_admin)])
|
| 94 |
+
async def get_audit_logs(
|
| 95 |
+
api_key: str = Path(..., description="The API key to audit"),
|
| 96 |
+
start_date: Optional[str] = Query(None),
|
| 97 |
+
end_date: Optional[str] = Query(None),
|
| 98 |
+
limit: int = 100,
|
| 99 |
+
):
|
| 100 |
+
start = datetime.fromisoformat(start_date) if start_date else None
|
| 101 |
+
end = datetime.fromisoformat(end_date) if end_date else None
|
| 102 |
+
logs = tracker.get_audit_logs(api_key, start, end, limit)
|
| 103 |
+
return {"api_key": api_key, "logs": logs}
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
@router.get("/stats", dependencies=[Depends(verify_admin)])
|
| 107 |
+
async def get_global_stats():
|
| 108 |
+
with tracker._get_conn() as conn:
|
| 109 |
+
total_keys = conn.execute("SELECT COUNT(*) FROM api_keys WHERE is_active = 1").fetchone()[0]
|
| 110 |
+
total_requests = conn.execute("SELECT COUNT(*) FROM usage_log").fetchone()[0]
|
| 111 |
+
by_tier = conn.execute(
|
| 112 |
+
"SELECT tier, COUNT(*) as count FROM usage_log GROUP BY tier"
|
| 113 |
+
).fetchall()
|
| 114 |
+
month = tracker._get_month_key()
|
| 115 |
+
current_month_requests = conn.execute(
|
| 116 |
+
"SELECT SUM(count) FROM monthly_counts WHERE year_month = ?", (month,)
|
| 117 |
+
).fetchone()[0] or 0
|
| 118 |
+
return {
|
| 119 |
+
"active_api_keys": total_keys,
|
| 120 |
+
"total_evaluations": total_requests,
|
| 121 |
+
"current_month_evaluations": current_month_requests,
|
| 122 |
+
"by_tier": [{"tier": row[0], "count": row[1]} for row in by_tier],
|
| 123 |
+
}
|
app/api/routes_governance.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, Depends, HTTPException, Request, BackgroundTasks
|
| 2 |
+
from fastapi.encoders import jsonable_encoder
|
| 3 |
+
from sqlalchemy.orm import Session
|
| 4 |
+
from app.models.infrastructure_intents import InfrastructureIntentRequest
|
| 5 |
+
from app.services.intent_adapter import to_oss_intent
|
| 6 |
+
from app.services.risk_service import evaluate_intent, evaluate_healing_decision
|
| 7 |
+
from app.services.intent_store import save_evaluated_intent
|
| 8 |
+
from app.services.outcome_service import record_outcome
|
| 9 |
+
from app.api.deps import get_db
|
| 10 |
+
from pydantic import BaseModel
|
| 11 |
+
import uuid
|
| 12 |
+
import logging
|
| 13 |
+
import time
|
| 14 |
+
|
| 15 |
+
from agentic_reliability_framework.core.models.event import ReliabilityEvent
|
| 16 |
+
|
| 17 |
+
# ===== USAGE TRACKER IMPORTS =====
|
| 18 |
+
from app.core.usage_tracker import enforce_quota, UsageRecord, tracker
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
router = APIRouter()
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class OutcomeRequest(BaseModel):
    """Request body for recording the real-world outcome of an evaluated intent."""
    # ID returned by /intents/evaluate ("intent_id" in its response).
    deterministic_id: str
    # Whether the executed change actually succeeded.
    success: bool
    # Identifier of the person/system reporting this outcome.
    recorded_by: str
    # Optional free-form context.
    notes: str = ""
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class HealingDecisionRequest(BaseModel):
    """Request body for /healing/evaluate: the incident to decide on."""
    # The ARF reliability event describing the observed incident.
    event: ReliabilityEvent
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@router.post("/intents/evaluate")
|
| 36 |
+
async def evaluate_intent_endpoint(
|
| 37 |
+
request: Request,
|
| 38 |
+
intent_req: InfrastructureIntentRequest,
|
| 39 |
+
background_tasks: BackgroundTasks,
|
| 40 |
+
db: Session = Depends(get_db),
|
| 41 |
+
quota: dict = Depends(enforce_quota)
|
| 42 |
+
):
|
| 43 |
+
start_time = time.time()
|
| 44 |
+
api_key = quota["api_key"]
|
| 45 |
+
tier = quota["tier"]
|
| 46 |
+
response_data = None
|
| 47 |
+
error_msg = None
|
| 48 |
+
|
| 49 |
+
try:
|
| 50 |
+
oss_intent = to_oss_intent(intent_req)
|
| 51 |
+
risk_engine = request.app.state.risk_engine
|
| 52 |
+
result = evaluate_intent(
|
| 53 |
+
engine=risk_engine,
|
| 54 |
+
intent=oss_intent,
|
| 55 |
+
cost_estimate=intent_req.estimated_cost,
|
| 56 |
+
policy_violations=intent_req.policy_violations
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
deterministic_id = str(uuid.uuid4())
|
| 60 |
+
api_payload = jsonable_encoder(intent_req.model_dump())
|
| 61 |
+
oss_payload = jsonable_encoder(oss_intent.model_dump())
|
| 62 |
+
|
| 63 |
+
save_evaluated_intent(
|
| 64 |
+
db=db,
|
| 65 |
+
deterministic_id=deterministic_id,
|
| 66 |
+
intent_type=intent_req.intent_type,
|
| 67 |
+
api_payload=api_payload,
|
| 68 |
+
oss_payload=oss_payload,
|
| 69 |
+
environment=str(intent_req.environment),
|
| 70 |
+
risk_score=result["risk_score"]
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
result["intent_id"] = deterministic_id
|
| 74 |
+
response_data = result
|
| 75 |
+
|
| 76 |
+
if tracker:
|
| 77 |
+
record = UsageRecord(
|
| 78 |
+
api_key=api_key,
|
| 79 |
+
tier=tier,
|
| 80 |
+
timestamp=time.time(),
|
| 81 |
+
endpoint="/api/v1/intents/evaluate",
|
| 82 |
+
request_body=intent_req.model_dump(),
|
| 83 |
+
response=response_data,
|
| 84 |
+
processing_ms=(time.time() - start_time) * 1000,
|
| 85 |
+
)
|
| 86 |
+
await tracker.increment_usage_async(record, background_tasks)
|
| 87 |
+
|
| 88 |
+
return response_data
|
| 89 |
+
|
| 90 |
+
except HTTPException:
|
| 91 |
+
raise
|
| 92 |
+
except Exception as e:
|
| 93 |
+
error_msg = str(e)
|
| 94 |
+
logger.exception("Error in evaluate_intent_endpoint")
|
| 95 |
+
if tracker:
|
| 96 |
+
record = UsageRecord(
|
| 97 |
+
api_key=api_key,
|
| 98 |
+
tier=tier,
|
| 99 |
+
timestamp=time.time(),
|
| 100 |
+
endpoint="/api/v1/intents/evaluate",
|
| 101 |
+
request_body=intent_req.model_dump(),
|
| 102 |
+
error=error_msg,
|
| 103 |
+
processing_ms=(time.time() - start_time) * 1000,
|
| 104 |
+
)
|
| 105 |
+
await tracker.increment_usage_async(record, background_tasks)
|
| 106 |
+
raise HTTPException(status_code=500, detail=error_msg)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
@router.post("/intents/outcome")
|
| 110 |
+
async def record_outcome_endpoint(
|
| 111 |
+
request: Request,
|
| 112 |
+
outcome: OutcomeRequest,
|
| 113 |
+
db: Session = Depends(get_db)
|
| 114 |
+
):
|
| 115 |
+
# No usage tracking for outcomes (doesn't count against quota)
|
| 116 |
+
try:
|
| 117 |
+
risk_engine = request.app.state.risk_engine
|
| 118 |
+
outcome_record = record_outcome(
|
| 119 |
+
db=db,
|
| 120 |
+
deterministic_id=outcome.deterministic_id,
|
| 121 |
+
success=outcome.success,
|
| 122 |
+
recorded_by=outcome.recorded_by,
|
| 123 |
+
notes=outcome.notes,
|
| 124 |
+
risk_engine=risk_engine
|
| 125 |
+
)
|
| 126 |
+
return {"message": "Outcome recorded", "outcome_id": outcome_record.id}
|
| 127 |
+
except Exception as e:
|
| 128 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
@router.post("/healing/evaluate")
|
| 132 |
+
async def evaluate_healing_decision_endpoint(
|
| 133 |
+
request: Request,
|
| 134 |
+
decision_req: HealingDecisionRequest,
|
| 135 |
+
background_tasks: BackgroundTasks,
|
| 136 |
+
quota: dict = Depends(enforce_quota)
|
| 137 |
+
):
|
| 138 |
+
start_time = time.time()
|
| 139 |
+
api_key = quota["api_key"]
|
| 140 |
+
tier = quota["tier"]
|
| 141 |
+
response_data = None
|
| 142 |
+
error_msg = None
|
| 143 |
+
|
| 144 |
+
try:
|
| 145 |
+
policy_engine = request.app.state.policy_engine
|
| 146 |
+
rag_graph = getattr(request.app.state, "rag_graph", None)
|
| 147 |
+
model = getattr(request.app.state, "epistemic_model", None)
|
| 148 |
+
tokenizer = getattr(request.app.state, "epistemic_tokenizer", None)
|
| 149 |
+
|
| 150 |
+
response_data = evaluate_healing_decision(
|
| 151 |
+
event=decision_req.event,
|
| 152 |
+
policy_engine=policy_engine,
|
| 153 |
+
decision_engine=None,
|
| 154 |
+
rag_graph=rag_graph,
|
| 155 |
+
model=model,
|
| 156 |
+
tokenizer=tokenizer,
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
if tracker:
|
| 160 |
+
record = UsageRecord(
|
| 161 |
+
api_key=api_key,
|
| 162 |
+
tier=tier,
|
| 163 |
+
timestamp=time.time(),
|
| 164 |
+
endpoint="/api/v1/healing/evaluate",
|
| 165 |
+
request_body=decision_req.model_dump(),
|
| 166 |
+
response=response_data,
|
| 167 |
+
processing_ms=(time.time() - start_time) * 1000,
|
| 168 |
+
)
|
| 169 |
+
await tracker.increment_usage_async(record, background_tasks)
|
| 170 |
+
|
| 171 |
+
return response_data
|
| 172 |
+
|
| 173 |
+
except HTTPException:
|
| 174 |
+
raise
|
| 175 |
+
except Exception as e:
|
| 176 |
+
error_msg = str(e)
|
| 177 |
+
logger.exception("Error in evaluate_healing_decision_endpoint")
|
| 178 |
+
if tracker:
|
| 179 |
+
record = UsageRecord(
|
| 180 |
+
api_key=api_key,
|
| 181 |
+
tier=tier,
|
| 182 |
+
timestamp=time.time(),
|
| 183 |
+
endpoint="/api/v1/healing/evaluate",
|
| 184 |
+
request_body=decision_req.model_dump(),
|
| 185 |
+
error=error_msg,
|
| 186 |
+
processing_ms=(time.time() - start_time) * 1000,
|
| 187 |
+
)
|
| 188 |
+
await tracker.increment_usage_async(record, background_tasks)
|
| 189 |
+
raise HTTPException(status_code=500, detail=error_msg)
|
app/api/routes_history.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter
|
| 2 |
+
from app.core.storage import incident_history
|
| 3 |
+
|
| 4 |
+
router = APIRouter()
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@router.get("/history")
|
| 8 |
+
async def get_history():
|
| 9 |
+
return {"incidents": incident_history}
|
app/api/routes_incidents.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from app.causal_explainer import CausalExplainer
|
| 2 |
+
from fastapi import APIRouter, Depends, Request, BackgroundTasks, HTTPException
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from typing import Optional
|
| 5 |
+
from enum import Enum
|
| 6 |
+
import time
|
| 7 |
+
import json
|
| 8 |
+
|
| 9 |
+
# ===== USAGE TRACKER IMPORTS =====
|
| 10 |
+
from app.core.usage_tracker import enforce_quota, UsageRecord, tracker
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class HealingAction(str, Enum):
    """Remediation actions this API can recommend for an incident.

    str-valued so members serialize directly in JSON responses.
    """
    NO_ACTION = "no_action"
    RESTART_CONTAINER = "restart_container"
    SCALE_OUT = "scale_out"
    ROLLBACK = "rollback"
    CIRCUIT_BREAKER = "circuit_breaker"
    TRAFFIC_SHIFT = "traffic_shift"
    ALERT_TEAM = "alert_team"
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class ReliabilityEvent(BaseModel):
    """Incoming incident telemetry for a single component."""
    # Name of the affected service/component.
    component: str
    # 99th-percentile latency; presumably milliseconds (the heuristic in
    # evaluate_incident divides by 1000) — TODO confirm units.
    latency_p99: float
    # Fraction of failing requests, expected in [0, 1].
    error_rate: float
    service_mesh: str = "default"
    cpu_util: Optional[float] = None
    memory_util: Optional[float] = None
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
router = APIRouter()
# Process-local incident store; resets on restart and is not shared
# across workers — fine for demos, not for production persistence.
incident_history = []
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@router.post("/report_incident")
|
| 37 |
+
async def report_incident(event: ReliabilityEvent):
|
| 38 |
+
incident_history.append(event.dict())
|
| 39 |
+
return {"status": "recorded"}
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@router.post("/v1/incidents/evaluate")
|
| 43 |
+
async def evaluate_incident(
|
| 44 |
+
request: Request,
|
| 45 |
+
event: ReliabilityEvent,
|
| 46 |
+
background_tasks: BackgroundTasks,
|
| 47 |
+
quota: dict = Depends(enforce_quota)
|
| 48 |
+
):
|
| 49 |
+
start_time = time.time()
|
| 50 |
+
api_key = quota["api_key"]
|
| 51 |
+
tier = quota["tier"]
|
| 52 |
+
response_data = None
|
| 53 |
+
error_msg = None
|
| 54 |
+
|
| 55 |
+
try:
|
| 56 |
+
# Simple risk score (heuristic)
|
| 57 |
+
risk_score = min(1.0, (event.latency_p99 / 1000.0) * 0.7 + event.error_rate * 0.3)
|
| 58 |
+
|
| 59 |
+
if event.latency_p99 > 500 or event.error_rate > 0.15:
|
| 60 |
+
optimal_action = HealingAction.RESTART_CONTAINER
|
| 61 |
+
else:
|
| 62 |
+
optimal_action = HealingAction.NO_ACTION
|
| 63 |
+
|
| 64 |
+
current_state = {
|
| 65 |
+
"latency": event.latency_p99,
|
| 66 |
+
"error_rate": event.error_rate,
|
| 67 |
+
"last_action": {"action_type": "no_action"}
|
| 68 |
+
}
|
| 69 |
+
proposed_action = {"action_type": optimal_action.value, "params": {}}
|
| 70 |
+
ce = CausalExplainer()
|
| 71 |
+
causal_exp = ce.explain_healing_intent(proposed_action, current_state, "latency")
|
| 72 |
+
|
| 73 |
+
healing_intent = {
|
| 74 |
+
"action": optimal_action.value,
|
| 75 |
+
"component": event.component,
|
| 76 |
+
"parameters": proposed_action["params"],
|
| 77 |
+
"justification": f"Causal: {causal_exp.explanation_text}",
|
| 78 |
+
"confidence": 0.85,
|
| 79 |
+
"risk_score": risk_score,
|
| 80 |
+
"status": "oss_advisory_only"
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
response_data = {
|
| 84 |
+
"healing_intent": healing_intent,
|
| 85 |
+
"causal_explanation": {
|
| 86 |
+
"factual_outcome": causal_exp.factual_outcome,
|
| 87 |
+
"counterfactual_outcome": causal_exp.counterfactual_outcome,
|
| 88 |
+
"effect": causal_exp.effect,
|
| 89 |
+
"explanation_text": causal_exp.explanation_text,
|
| 90 |
+
"is_model_based": causal_exp.is_model_based,
|
| 91 |
+
"warnings": causal_exp.warnings
|
| 92 |
+
},
|
| 93 |
+
"utility_decision": {
|
| 94 |
+
"best_action": optimal_action.value,
|
| 95 |
+
"expected_utility": 0.5,
|
| 96 |
+
"explanation": "Heuristic decision based on latency/error thresholds"
|
| 97 |
+
}
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
# Asynchronous usage logging
|
| 101 |
+
if tracker:
|
| 102 |
+
record = UsageRecord(
|
| 103 |
+
api_key=api_key,
|
| 104 |
+
tier=tier,
|
| 105 |
+
timestamp=time.time(),
|
| 106 |
+
endpoint="/v1/incidents/evaluate",
|
| 107 |
+
request_body=event.dict(),
|
| 108 |
+
response=response_data,
|
| 109 |
+
processing_ms=(time.time() - start_time) * 1000,
|
| 110 |
+
)
|
| 111 |
+
await tracker.increment_usage_async(record, background_tasks)
|
| 112 |
+
|
| 113 |
+
return response_data
|
| 114 |
+
|
| 115 |
+
except HTTPException:
|
| 116 |
+
raise
|
| 117 |
+
except Exception as e:
|
| 118 |
+
error_msg = str(e)
|
| 119 |
+
# Log failure in background
|
| 120 |
+
if tracker:
|
| 121 |
+
record = UsageRecord(
|
| 122 |
+
api_key=api_key,
|
| 123 |
+
tier=tier,
|
| 124 |
+
timestamp=time.time(),
|
| 125 |
+
endpoint="/v1/incidents/evaluate",
|
| 126 |
+
request_body=event.dict(),
|
| 127 |
+
error=error_msg,
|
| 128 |
+
processing_ms=(time.time() - start_time) * 1000,
|
| 129 |
+
)
|
| 130 |
+
await tracker.increment_usage_async(record, background_tasks)
|
| 131 |
+
raise HTTPException(status_code=500, detail=error_msg)
|
app/api/routes_intents.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, HTTPException
|
| 2 |
+
from app.models.intent_models import IntentSimulation, IntentSimulationResponse
|
| 3 |
+
from app.services.intent_service import simulate_intent
|
| 4 |
+
|
| 5 |
+
router = APIRouter()
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@router.post("/simulate_intent", response_model=IntentSimulationResponse)
|
| 9 |
+
async def simulate_intent_endpoint(intent: IntentSimulation):
|
| 10 |
+
try:
|
| 11 |
+
result = simulate_intent(intent)
|
| 12 |
+
return IntentSimulationResponse(**result)
|
| 13 |
+
except Exception as e:
|
| 14 |
+
raise HTTPException(status_code=500, detail=str(e))
|
app/api/routes_memory.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, Request
|
| 2 |
+
|
| 3 |
+
router = APIRouter()
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@router.get("/stats")
|
| 7 |
+
async def memory_stats(request: Request):
|
| 8 |
+
"""
|
| 9 |
+
Return current memory graph statistics using the risk engine's memory instance.
|
| 10 |
+
"""
|
| 11 |
+
risk_engine = request.app.state.risk_engine
|
| 12 |
+
|
| 13 |
+
# Check if memory exists and has the required method
|
| 14 |
+
if hasattr(risk_engine, 'memory') and hasattr(risk_engine.memory, 'get_graph_stats'):
|
| 15 |
+
stats = risk_engine.memory.get_graph_stats()
|
| 16 |
+
return stats
|
| 17 |
+
else:
|
| 18 |
+
# Graceful fallback (e.g., during testing or if memory not initialized)
|
| 19 |
+
return {
|
| 20 |
+
"incident_nodes": 0,
|
| 21 |
+
"outcome_nodes": 0,
|
| 22 |
+
"edges": 0,
|
| 23 |
+
"message": "Memory not fully initialized"
|
| 24 |
+
}
|
app/api/routes_payments.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Payment endpoints – Stripe Checkout integration.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import stripe
|
| 7 |
+
from fastapi import APIRouter, HTTPException, Request
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
from app.core.config import settings
|
| 12 |
+
from app.core.usage_tracker import tracker, Tier
|
| 13 |
+
|
| 14 |
+
router = APIRouter(prefix="/payments", tags=["payments"])
|
| 15 |
+
|
| 16 |
+
# Set Stripe API key (from environment)
|
| 17 |
+
stripe.api_key = os.getenv("STRIPE_SECRET_KEY")
|
| 18 |
+
STRIPE_WEBHOOK_SECRET = os.getenv("STRIPE_WEBHOOK_SECRET")
|
| 19 |
+
|
| 20 |
+
class CheckoutRequest(BaseModel):
    """Request body for creating a Stripe Checkout session."""
    # The caller's existing (free-tier) API key to upgrade.
    api_key: str
    # Where Stripe redirects after a successful payment.
    success_url: str
    # Where Stripe redirects if the user cancels checkout.
    cancel_url: str
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@router.post("/create-checkout-session")
|
| 27 |
+
async def create_checkout_session(req: CheckoutRequest):
|
| 28 |
+
"""Create a Stripe Checkout session for the Pro tier."""
|
| 29 |
+
if not stripe.api_key:
|
| 30 |
+
raise HTTPException(status_code=500, detail="Stripe not configured")
|
| 31 |
+
|
| 32 |
+
# Verify the API key exists and is free tier
|
| 33 |
+
tier = tracker.get_tier(req.api_key) if tracker else None
|
| 34 |
+
if tier != Tier.FREE:
|
| 35 |
+
raise HTTPException(status_code=400, detail="Only free tier keys can be upgraded")
|
| 36 |
+
|
| 37 |
+
try:
|
| 38 |
+
checkout_session = stripe.checkout.Session.create(
|
| 39 |
+
payment_method_types=["card"],
|
| 40 |
+
line_items=[
|
| 41 |
+
{
|
| 42 |
+
"price": os.getenv("STRIPE_PRO_PRICE_ID"), # e.g., "price_123"
|
| 43 |
+
"quantity": 1,
|
| 44 |
+
}
|
| 45 |
+
],
|
| 46 |
+
mode="subscription",
|
| 47 |
+
success_url=req.success_url,
|
| 48 |
+
cancel_url=req.cancel_url,
|
| 49 |
+
metadata={"api_key": req.api_key},
|
| 50 |
+
client_reference_id=req.api_key,
|
| 51 |
+
)
|
| 52 |
+
return {"sessionId": checkout_session.id, "url": checkout_session.url}
|
| 53 |
+
except Exception as e:
|
| 54 |
+
raise HTTPException(status_code=500, detail=str(e))
|
app/api/routes_risk.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, HTTPException
|
| 2 |
+
from app.models.risk_models import RiskResponse
|
| 3 |
+
from app.services.risk_service import get_system_risk
|
| 4 |
+
|
| 5 |
+
router = APIRouter()
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _classify_risk(risk: float) -> str:
    """Map a numeric risk score (expected in [0, 1]) to a status band."""
    if risk < 0.3:
        return "low"
    if risk < 0.6:
        return "moderate"
    if risk < 0.8:
        return "high"
    return "critical"


@router.get("/get_risk", response_model=RiskResponse)
async def get_risk():
    """Return the current system risk score and its human-readable status."""
    try:
        # Only the risk computation is expected to fail; keep the try narrow.
        risk = get_system_risk()
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return RiskResponse(system_risk=risk, status=_classify_risk(risk))
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
@router.get("/history")
async def get_risk_history():
    """
    Return dummy historical risk data for the last 24 hours.
    Replace with real database query later.
    """
    import random
    import datetime

    now = datetime.datetime.now()
    # One synthetic sample per hour, oldest first (24h ago .. 1h ago).
    return [
        {
            "time": (now - datetime.timedelta(hours=hours_ago)).isoformat(),
            "risk": round(random.uniform(0.2, 0.8), 2),
        }
        for hours_ago in range(24, 0, -1)
    ]
|
app/api/routes_users.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
User endpoints – registration and quota information.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import uuid
|
| 6 |
+
from fastapi import APIRouter, Depends, HTTPException, Request
|
| 7 |
+
from slowapi import Limiter
|
| 8 |
+
from slowapi.util import get_remote_address
|
| 9 |
+
from app.core.usage_tracker import tracker, enforce_quota, Tier
|
| 10 |
+
|
| 11 |
+
router = APIRouter(prefix="/users", tags=["users"])
|
| 12 |
+
|
| 13 |
+
# Rate limiter for registration (5 per hour per IP)
|
| 14 |
+
limiter = Limiter(key_func=get_remote_address, default_limits=["5/hour"])
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@router.post("/register")
@limiter.limit("5/hour")
async def register_user(request: Request):
    """
    Public endpoint to create a new free-tier API key.
    Rate-limited to 5 requests per hour per IP address.
    """
    if tracker is None:
        raise HTTPException(status_code=503, detail="Usage tracking not available")

    # Mint a fresh random key and persist it at the FREE tier.
    fresh_key = f"sk_free_{uuid.uuid4().hex[:24]}"
    if not tracker.get_or_create_api_key(fresh_key, Tier.FREE):
        raise HTTPException(status_code=500, detail="Failed to create API key")

    return {
        "api_key": fresh_key,
        "tier": "free",
        "message": "API key created. Store it securely – you won't see it again.",
    }
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@router.get("/quota")
async def get_user_quota(request: Request, quota: dict = Depends(enforce_quota)):
    """
    Return the current user's tier and remaining evaluation quota.
    Requires API key in Authorization header.
    """
    tier = quota["tier"]
    # `enforce_quota` always supplies a Tier here; the guard mirrors its
    # disabled-tracking fallback just in case.
    return {
        "tier": tier.value,
        "remaining": quota["remaining"],
        "limit": tier.monthly_evaluation_limit if tier else None,
    }
|
app/api/webhooks.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Stripe webhook handler – updates API key tier on subscription events.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import stripe
|
| 7 |
+
from fastapi import APIRouter, Request, HTTPException
|
| 8 |
+
from app.core.usage_tracker import update_key_tier, Tier
|
| 9 |
+
|
| 10 |
+
router = APIRouter(prefix="/webhooks", tags=["webhooks"])
|
| 11 |
+
|
| 12 |
+
STRIPE_WEBHOOK_SECRET = os.getenv("STRIPE_WEBHOOK_SECRET")
|
| 13 |
+
stripe.api_key = os.getenv("STRIPE_SECRET_KEY")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@router.post("/stripe")
async def stripe_webhook(request: Request):
    """Handle Stripe webhook events and adjust API-key tiers.

    Events handled:
        checkout.session.completed    -> upgrade the key to PRO
        customer.subscription.deleted -> downgrade the key to FREE

    Raises:
        HTTPException 500: Stripe environment variables missing.
        HTTPException 400: missing signature header, invalid payload,
            or signature verification failure.
    """
    if not STRIPE_WEBHOOK_SECRET or not stripe.api_key:
        raise HTTPException(status_code=500, detail="Stripe not configured")

    payload = await request.body()
    sig_header = request.headers.get("stripe-signature")
    if not sig_header:
        # Reject unsigned requests explicitly instead of letting
        # construct_event fail on a None header.
        raise HTTPException(status_code=400, detail="Missing Stripe signature")

    try:
        event = stripe.Webhook.construct_event(
            payload, sig_header, STRIPE_WEBHOOK_SECRET
        )
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid payload")
    except stripe.error.SignatureVerificationError:
        raise HTTPException(status_code=400, detail="Invalid signature")

    # Handle subscription events
    if event["type"] == "checkout.session.completed":
        session = event["data"]["object"]
        # The key was stashed in both fields when the checkout was created.
        api_key = session.get("client_reference_id") or session.get("metadata", {}).get("api_key")
        if api_key:
            update_key_tier(api_key, Tier.PRO)
    elif event["type"] == "customer.subscription.deleted":
        subscription = event["data"]["object"]
        # Assumes the api_key was copied into the subscription's metadata
        # during checkout; otherwise a customer-id -> key lookup is needed.
        api_key = subscription.get("metadata", {}).get("api_key")
        if api_key:
            update_key_tier(api_key, Tier.FREE)

    return {"status": "ok"}
|
app/causal_explainer.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Causal reasoning module for ARF – OSS Edition.
|
| 3 |
+
Provides counterfactual explanations using deterministic heuristics.
|
| 4 |
+
No external causal libraries required.
|
| 5 |
+
"""
|
| 6 |
+
from typing import Dict, Any, Optional, List, Tuple
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
import pandas as pd
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class CausalExplanation:
    """Result of a counterfactual query ("what if we had acted differently?")."""

    factual_outcome: float                 # observed metric value
    counterfactual_outcome: float          # estimated value under the alternative action
    effect: float                          # counterfactual - factual
    is_model_based: bool                   # False -> heuristic rules, True -> fitted SCM
    confidence_interval: Optional[Tuple[float, float]] = None
    explanation_text: str = ""
    warnings: List[str] = field(default_factory=list)


class CausalExplainer:
    """
    Heuristic causal explainer for healing actions.
    Uses domain rules and correlation estimates to produce counterfactuals.
    """

    def __init__(self, memory_store=None):
        self.memory = memory_store
        self.treatment = "healing_action"  # symbolic name
        self.outcome = "latency"
        # Fractional change each action is assumed to cause, per metric
        # (e.g. -0.20 means "reduces the metric by 20%").
        self._action_impact = {
            "restart_container": {"latency_effect": -0.15, "error_rate_effect": -0.10},
            "scale_out": {"latency_effect": -0.20, "error_rate_effect": -0.05},
            "rollback": {"latency_effect": -0.25, "error_rate_effect": -0.20},
            "circuit_breaker": {"latency_effect": -0.05, "error_rate_effect": -0.30},
            "traffic_shift": {"latency_effect": -0.10, "error_rate_effect": -0.10},
            "alert_team": {"latency_effect": 0.0, "error_rate_effect": 0.0},
            "no_action": {"latency_effect": 0.0, "error_rate_effect": 0.0},
        }
        self._uncertainty = 0.1  # ±10% confidence interval

    def _extract_action_intensity(self, action_dict: Dict[str, Any]) -> float:
        """Return a rough 0..1 "aggressiveness" score for an action dict."""
        action_type = action_dict.get("action_type", "no_action")
        if action_type == "no_action":
            return 0.0
        intensity_map = {
            "restart_container": 0.4,
            "scale_out": 0.6,
            "rollback": 0.8,
            "circuit_breaker": 0.7,
            "traffic_shift": 0.5,
            "alert_team": 0.1,
        }
        return intensity_map.get(action_type, 0.0)

    def _get_effect_for_action(
            self, action_dict: Dict[str, Any], metric: str) -> float:
        """Look up the assumed fractional effect of an action on a metric."""
        action_type = action_dict.get("action_type", "no_action")
        impacts = self._action_impact.get(
            action_type, self._action_impact["no_action"])
        if metric == "latency":
            return impacts["latency_effect"]
        elif metric == "error_rate":
            return impacts["error_rate_effect"]
        return 0.0

    def counterfactual_explanation(
        self,
        observed_context: Dict[str, Any],
        alternative_action: Dict[str, Any],
        outcome_name: str = "latency",
        confidence_level: float = 0.95
    ) -> CausalExplanation:
        """Estimate the outcome had `alternative_action` been taken instead.

        Args:
            observed_context: must contain the observed metric under
                `outcome_name` and (optionally) "action_taken".
            alternative_action: dict with an "action_type" key.
            outcome_name: metric to reason about ("latency" or "error_rate").
            confidence_level: accepted for API compatibility; the interval
                is derived from the fixed ±10% heuristic, not from this value.

        Returns:
            A CausalExplanation with the estimated counterfactual, the delta,
            a heuristic confidence interval, and a human-readable sentence.
        """
        factual_outcome = observed_context.get(outcome_name, 0.0)
        factual_action = observed_context.get("action_taken", {})
        factual_intensity = self._extract_action_intensity(factual_action)
        alt_intensity = self._extract_action_intensity(alternative_action)

        effect_frac = self._get_effect_for_action(
            alternative_action, outcome_name)
        if alt_intensity == 0.0 and factual_intensity > 0.0:
            # "Undo" case: the counterfactual is *removing* the factual
            # action, so invert its assumed effect.
            factual_effect = self._get_effect_for_action(
                factual_action, outcome_name)
            effect_frac = -factual_effect

        counterfactual = max(0.0, factual_outcome * (1.0 + effect_frac))
        effect = counterfactual - factual_outcome
        ci_half = abs(effect) * self._uncertainty
        confidence_interval = (
            counterfactual - ci_half,
            counterfactual + ci_half)

        # NOTE: the original formatted this with multi-line expressions inside
        # f-string braces, which is a SyntaxError before Python 3.12 (PEP 701).
        alt_name = alternative_action.get('action_type', 'unknown')
        factual_name = factual_action.get('action_type', 'no action')
        explanation_text = (
            f"If we apply {alt_name} instead of "
            f"{factual_name}, {outcome_name} would change "
            f"from {factual_outcome:.2f} to {counterfactual:.2f} (Δ = {effect:.2f}). "
            f"Based on heuristic causal model.")

        return CausalExplanation(
            factual_outcome=factual_outcome,
            counterfactual_outcome=counterfactual,
            effect=effect,
            is_model_based=False,
            confidence_interval=confidence_interval,
            explanation_text=explanation_text,
            warnings=["Using heuristic causal model (no fitted SCM)."]
        )

    def explain_healing_intent(
        self,
        proposed_action: Dict[str, Any],
        current_state: Dict[str, Any],
        outcome_metric: str = "latency"
    ) -> CausalExplanation:
        """Convenience wrapper: explain a proposed action against current state."""
        # current_state is spread last, so its keys win over the defaults.
        observed = {
            outcome_metric: current_state.get(outcome_metric, 0.0),
            "action_taken": current_state.get(
                "last_action", {"action_type": "no_action"}),
            **current_state,
        }
        return self.counterfactual_explanation(
            observed_context=observed,
            alternative_action=proposed_action,
            outcome_name=outcome_metric
        )

    def discover_graph_from_memory(
            self, data: pd.DataFrame, method: str = "pc") -> Dict[str, Any]:
        """Placeholder graph discovery: returns the columns as nodes, no edges."""
        return {"nodes": list(data.columns), "edges": []}

    def fit_scm(
            self,
            data: pd.DataFrame,
            treatment: str,
            outcome: str,
            graph: Optional[Dict] = None):
        """Placeholder SCM fit: records treatment/outcome names, fits nothing."""
        self.treatment = treatment
        self.outcome = outcome

    def estimate_effect(
            self,
            method_name: str = "backdoor.linear_regression") -> Optional[float]:
        """Placeholder effect estimation; no fitted model, so always None."""
        return None
|
app/core/__init__.py
ADDED
|
File without changes
|
app/core/config.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic_settings import BaseSettings
|
| 2 |
+
from typing import Optional
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class Settings(BaseSettings):
    """Application configuration, loaded from the environment / .env file."""

    app_name: str = "ARF API Control Plane"
    environment: str = "development"
    # SQLAlchemy URL; None until configured — consumers must handle absence.
    database_url: Optional[str] = None
    api_key: Optional[str] = None
    RATE_LIMIT: str = "100/minute"  # default limit

    # Usage tracker settings
    ARF_USAGE_TRACKING: bool = False
    ARF_USAGE_DB_PATH: str = "arf_usage.db"
    ARF_REDIS_URL: Optional[str] = None
    ARF_API_KEYS: str = "{}"  # JSON string of {key: tier}

    class Config:
        env_file = ".env"
        extra = "ignore"  # silently ignore unrelated env vars


# Module-level singleton imported throughout the app.
settings = Settings()
|
app/core/storage.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Simple in-memory list for incident history
# NOTE: not persisted and not shared across processes/workers — suitable
# for a single-process deployment only.
incident_history = []
|
app/core/usage_tracker.py
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Usage Tracker for ARF API – quotas, tiers, and audit logging.
|
| 3 |
+
Non‑invasive, configurable, thread‑safe, and background‑task ready.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import json
|
| 8 |
+
import sqlite3
|
| 9 |
+
import threading
|
| 10 |
+
import time
|
| 11 |
+
from contextlib import contextmanager
|
| 12 |
+
from datetime import datetime, timedelta
|
| 13 |
+
from typing import Dict, Any, Optional, List
|
| 14 |
+
from enum import Enum
|
| 15 |
+
from dataclasses import dataclass
|
| 16 |
+
from fastapi import BackgroundTasks
|
| 17 |
+
|
| 18 |
+
# Optional Redis support
|
| 19 |
+
try:
|
| 20 |
+
import redis
|
| 21 |
+
REDIS_AVAILABLE = True
|
| 22 |
+
except ImportError:
|
| 23 |
+
REDIS_AVAILABLE = False
|
| 24 |
+
redis = None
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class Tier(str, Enum):
    """Subscription tier; the string value is what gets stored in the DB."""

    FREE = "free"
    PRO = "pro"
    PREMIUM = "premium"
    ENTERPRISE = "enterprise"

    @property
    def monthly_evaluation_limit(self) -> Optional[int]:
        """Monthly evaluation cap for this tier; None means unlimited."""
        limits = {
            Tier.FREE: 1000,
            Tier.PRO: 10_000,
            Tier.PREMIUM: 50_000,
            Tier.ENTERPRISE: None,  # unlimited
        }
        return limits[self]

    @property
    def audit_log_retention_days(self) -> int:
        """Number of days usage/audit logs are kept for this tier."""
        retention = {
            Tier.FREE: 7,
            Tier.PRO: 30,
            Tier.PREMIUM: 90,
            Tier.ENTERPRISE: 365,
        }
        return retention[self]
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
@dataclass
class UsageRecord:
    """Single evaluation usage record, persisted to the audit log."""
    # Caller's key and its tier at the time of the request.
    api_key: str
    tier: Tier
    # Unix timestamp (seconds) when the request was handled.
    timestamp: float
    endpoint: str
    # Optional audit payloads; serialized to JSON when written to SQLite.
    request_body: Optional[Dict[str, Any]] = None
    response: Optional[Dict[str, Any]] = None
    error: Optional[str] = None
    processing_ms: Optional[float] = None
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class UsageTracker:
    """
    Thread-safe usage tracker with SQLite storage and optional Redis for counters.

    SQLite holds the API-key registry, per-month quota counters, and the
    audit log. When a Redis URL is supplied, the monthly quota counters
    live in Redis instead (SQLite still stores keys and audit logs).
    """

    def __init__(self, db_path: str = "arf_usage.db", redis_url: Optional[str] = None):
        self.db_path = db_path
        # One SQLite connection per thread; sqlite3 connections are not
        # safely shareable across threads.
        self._local = threading.local()
        self._init_db()

        self._redis_client = None
        if redis_url and REDIS_AVAILABLE:
            self._redis_client = redis.from_url(redis_url)
        elif redis_url:
            # A Redis URL was configured but the client library is missing —
            # fail loudly rather than silently falling back to SQLite.
            raise ImportError("Redis client not installed. Run: pip install redis")

    @contextmanager
    def _get_conn(self):
        """Get (lazily creating) a thread-local SQLite connection.

        The connection is yielded, not closed — it is cached on the thread
        for reuse across calls.
        """
        if not hasattr(self._local, "conn"):
            self._local.conn = sqlite3.connect(self.db_path, check_same_thread=False)
            # Row factory lets callers access columns by name.
            self._local.conn.row_factory = sqlite3.Row
        yield self._local.conn

    def _init_db(self):
        # Create the schema idempotently on startup.
        with self._get_conn() as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS api_keys (
                    key TEXT PRIMARY KEY,
                    tier TEXT NOT NULL,
                    created_at REAL NOT NULL,
                    last_used_at REAL,
                    is_active INTEGER DEFAULT 1
                )
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS usage_log (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    api_key TEXT NOT NULL,
                    tier TEXT NOT NULL,
                    timestamp REAL NOT NULL,
                    endpoint TEXT NOT NULL,
                    request_body TEXT,
                    response TEXT,
                    error TEXT,
                    processing_ms REAL
                )
            """)
            # Speeds up get_audit_logs (filter by key + time range).
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_api_key_timestamp
                ON usage_log (api_key, timestamp)
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS monthly_counts (
                    api_key TEXT NOT NULL,
                    year_month TEXT NOT NULL,
                    count INTEGER DEFAULT 0,
                    PRIMARY KEY (api_key, year_month)
                )
            """)
            conn.commit()

    def _get_month_key(self) -> str:
        # Quota buckets are keyed by calendar month, e.g. "2024-05".
        return datetime.now().strftime("%Y-%m")

    def get_or_create_api_key(self, key: str, tier: Tier = Tier.FREE) -> bool:
        """Register a new API key. Returns True if key exists or was created."""
        with self._get_conn() as conn:
            row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (key,)).fetchone()
            if row:
                # Existing key: left untouched (its tier is NOT updated here).
                return True
            conn.execute(
                "INSERT INTO api_keys (key, tier, created_at, is_active) VALUES (?, ?, ?, ?)",
                (key, tier.value, time.time(), 1)
            )
            conn.commit()
            return True

    def get_tier(self, api_key: str) -> Optional[Tier]:
        """Return the tier for a given API key, or None if key invalid/inactive."""
        with self._get_conn() as conn:
            row = conn.execute(
                "SELECT tier FROM api_keys WHERE key = ? AND is_active = 1",
                (api_key,)
            ).fetchone()
            if not row:
                return None
            return Tier(row["tier"])

    def update_api_key_tier(self, api_key: str, new_tier: Tier) -> bool:
        """Update the tier of an existing API key. Returns True if successful."""
        with self._get_conn() as conn:
            row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
            if not row:
                return False
            conn.execute("UPDATE api_keys SET tier = ? WHERE key = ?", (new_tier.value, api_key))
            conn.commit()
            return True

    def get_remaining_quota(self, api_key: str, tier: Tier) -> Optional[int]:
        """Return remaining evaluations for the month, or None if unlimited."""
        limit = tier.monthly_evaluation_limit
        if limit is None:
            return None

        month = self._get_month_key()
        if self._redis_client:
            redis_key = f"arf:quota:{api_key}:{month}"
            count = int(self._redis_client.get(redis_key) or 0)
            return max(0, limit - count)

        with self._get_conn() as conn:
            row = conn.execute(
                "SELECT count FROM monthly_counts WHERE api_key = ? AND year_month = ?",
                (api_key, month)
            ).fetchone()
            count = row["count"] if row else 0
            return max(0, limit - count)

    def _increment_quota(self, api_key: str, tier: Tier) -> None:
        """Increment the monthly counter (internal, synchronous)."""
        limit = tier.monthly_evaluation_limit
        if limit is None:
            # Unlimited tiers are not counted at all.
            return
        month = self._get_month_key()
        if self._redis_client:
            redis_key = f"arf:quota:{api_key}:{month}"
            self._redis_client.incr(redis_key)
            # NOTE(review): expire is re-applied on every increment, so the
            # TTL resets each time — the key lives ~31 days past the LAST
            # use, not past month start. Confirm this is intended.
            self._redis_client.expire(redis_key, timedelta(days=31))
        else:
            with self._get_conn() as conn:
                # Upsert: create the month row on first use, else increment.
                conn.execute(
                    """INSERT INTO monthly_counts (api_key, year_month, count)
                       VALUES (?, ?, 1)
                       ON CONFLICT(api_key, year_month) DO UPDATE SET count = count + 1""",
                    (api_key, month)
                )
                conn.commit()

    def _insert_audit_log(self, record: UsageRecord) -> None:
        """Insert a single audit log (internal, synchronous)."""
        with self._get_conn() as conn:
            conn.execute(
                """INSERT INTO usage_log
                   (api_key, tier, timestamp, endpoint, request_body, response, error, processing_ms)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    record.api_key,
                    record.tier.value,
                    record.timestamp,
                    record.endpoint,
                    json.dumps(record.request_body) if record.request_body else None,
                    json.dumps(record.response) if record.response else None,
                    record.error,
                    record.processing_ms,
                )
            )
            conn.commit()

    def increment_usage_sync(self, record: UsageRecord) -> bool:
        """
        Synchronously record usage and increment counter.
        Returns True if within quota (i.e., counter was incremented), False if quota exceeded.

        NOTE(review): the quota check and the increment are not atomic, so
        concurrent requests can race past the limit by a small margin.
        """
        tier = record.tier
        limit = tier.monthly_evaluation_limit
        if limit is not None:
            remaining = self.get_remaining_quota(record.api_key, tier)
            if remaining <= 0:
                return False
        self._increment_quota(record.api_key, tier)
        self._insert_audit_log(record)
        return True

    async def increment_usage_async(self, record: UsageRecord, background_tasks: BackgroundTasks) -> bool:
        """
        Asynchronously record usage using FastAPI BackgroundTasks.
        Returns True if quota allows (i.e., will be recorded), False if quota exceeded.

        NOTE(review): same check-then-increment race as the sync variant,
        widened by the background scheduling delay.
        """
        tier = record.tier
        limit = tier.monthly_evaluation_limit
        if limit is not None:
            remaining = self.get_remaining_quota(record.api_key, tier)
            if remaining <= 0:
                return False
        # Schedule the actual write in the background
        background_tasks.add_task(self._increment_quota, record.api_key, tier)
        background_tasks.add_task(self._insert_audit_log, record)
        return True

    def get_audit_logs(
        self,
        api_key: str,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        limit: int = 100,
    ) -> List[Dict[str, Any]]:
        """Retrieve audit logs for a given API key, newest first."""
        # Build the WHERE clause incrementally from the optional filters.
        query = "SELECT * FROM usage_log WHERE api_key = ?"
        params = [api_key]
        if start_date:
            query += " AND timestamp >= ?"
            params.append(start_date.timestamp())
        if end_date:
            query += " AND timestamp <= ?"
            params.append(end_date.timestamp())
        query += " ORDER BY timestamp DESC LIMIT ?"
        params.append(limit)

        with self._get_conn() as conn:
            rows = conn.execute(query, params).fetchall()
            return [dict(row) for row in rows]

    def clean_old_logs(self):
        """Delete logs older than retention period for each tier."""
        with self._get_conn() as conn:
            for tier in Tier:
                retention_days = tier.audit_log_retention_days
                # Defensive: currently every tier has a finite retention.
                if retention_days is None:
                    continue
                cutoff = time.time() - retention_days * 86400
                conn.execute(
                    "DELETE FROM usage_log WHERE tier = ? AND timestamp < ?",
                    (tier.value, cutoff)
                )
            conn.commit()
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
# Global instance (None until init_tracker is called at startup).
tracker: Optional[UsageTracker] = None


def init_tracker(db_path: str = "arf_usage.db", redis_url: Optional[str] = None):
    """Initialise the module-level tracker singleton. Call once at startup."""
    global tracker
    tracker = UsageTracker(db_path, redis_url)
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
def update_key_tier(api_key: str, new_tier: Tier) -> bool:
    """Globally accessible helper to update API key tier.

    Returns False when the tracker is not initialised or the key is unknown.
    """
    return tracker is not None and tracker.update_api_key_tier(api_key, new_tier)
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
# FastAPI dependency to enforce quota
|
| 312 |
+
from fastapi import HTTPException, Request
|
| 313 |
+
|
| 314 |
+
async def enforce_quota(request: Request, api_key: Optional[str] = None):
    """
    Dependency that checks API key and remaining quota.
    Use in your endpoint: `quota = Depends(enforce_quota)`

    If usage tracking is disabled, returns a default dict (no enforcement).

    Returns:
        dict with keys "api_key" (str), "tier" (Tier), "remaining" (int or
        None for unlimited/disabled).

    Raises:
        HTTPException 401: no API key supplied in header or query string.
        HTTPException 403: unknown or deactivated key.
        HTTPException 429: monthly evaluation quota exhausted.
    """
    # If tracker not initialised, allow all requests (fallback)
    if tracker is None:
        return {"api_key": api_key or "disabled", "tier": Tier.FREE, "remaining": None}

    # Extract API key from header or query
    if api_key is None:
        auth_header = request.headers.get("Authorization")
        if auth_header and auth_header.startswith("Bearer "):
            api_key = auth_header[7:]  # strip the "Bearer " prefix
        else:
            api_key = request.query_params.get("api_key")

    if not api_key:
        raise HTTPException(status_code=401, detail="Missing API key")

    tier = tracker.get_tier(api_key)
    if tier is None:
        raise HTTPException(status_code=403, detail="Invalid or inactive API key")

    remaining = tracker.get_remaining_quota(api_key, tier)
    if remaining is not None and remaining <= 0:
        raise HTTPException(status_code=429, detail="Monthly evaluation quota exceeded")

    # Store in request state for later logging
    request.state.api_key = api_key
    request.state.tier = tier
    return {"api_key": api_key, "tier": tier, "remaining": remaining}
|
app/database/__init__.py
ADDED
|
File without changes
|
app/database/base.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy.ext.declarative import declarative_base
|
| 2 |
+
Base = declarative_base()
|
app/database/models_intents.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy import Column, Integer, String, DateTime, Boolean, Text, JSON, ForeignKey, UniqueConstraint
|
| 2 |
+
from sqlalchemy.orm import relationship
|
| 3 |
+
import datetime
|
| 4 |
+
from .base import Base
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class IntentDB(Base):
    """Persisted healing intent."""
    __tablename__ = "intents"
    id = Column(Integer, primary_key=True, index=True)
    # Unique per-intent identifier — presumably a content hash enabling
    # idempotent writes; confirm against the writer in intent_store.
    deterministic_id = Column(String(64), unique=True, index=True, nullable=False)
    intent_type = Column(String(64), nullable=False)
    payload = Column(JSON, nullable=False)
    oss_payload = Column(JSON, nullable=True)
    environment = Column(String(32), nullable=True)
    # NOTE(review): datetime.utcnow yields naive UTC and is deprecated in
    # Python 3.12+ — consider a timezone-aware default; verify DB expectations.
    created_at = Column(DateTime, default=datetime.datetime.utcnow, nullable=False)
    evaluated_at = Column(DateTime, nullable=True)
    # Stored as a string despite being a score — TODO confirm whether a
    # numeric column was intended.
    risk_score = Column(String(32), nullable=True)
    # Deleting an intent deletes its outcome rows too.
    outcomes = relationship("OutcomeDB", back_populates="intent", cascade="all, delete-orphan")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class OutcomeDB(Base):
    """ORM model for the recorded execution outcome of a single intent."""
    __tablename__ = "intent_outcomes"
    id = Column(Integer, primary_key=True, index=True)
    # Cascade delete: removing an intent removes its outcome row.
    intent_id = Column(Integer, ForeignKey("intents.id", ondelete="CASCADE"), nullable=False)
    success = Column(Boolean, nullable=False)
    # Identifier of the actor/service that reported the outcome.
    recorded_by = Column(String(128), nullable=True)
    notes = Column(Text, nullable=True)
    recorded_at = Column(DateTime, default=datetime.datetime.utcnow, nullable=False)
    intent = relationship("IntentDB", back_populates="outcomes")

    # At most one outcome per intent; conflicting re-records are rejected
    # at the service layer (OutcomeConflictError).
    __table_args__ = (
        UniqueConstraint("intent_id", name="uq_outcome_intentid"),
    )
|
app/database/session.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Database engine and session factory; `settings.database_url` supplies the DSN.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from app.core.config import settings

# NOTE(review): no pool_pre_ping / pool sizing configured — confirm the
# driver defaults are acceptable for the deployment target.
engine = create_engine(settings.database_url)
# Request-scoped sessions are created from this factory (see app.api.deps).
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
app/main.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ARF API Control Plane - Main Application Entry Point
|
| 3 |
+
With optional heavy dependencies and usage tracking.
|
| 4 |
+
"""
|
| 5 |
+
import logging
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import json
|
| 9 |
+
from contextlib import asynccontextmanager
|
| 10 |
+
from typing import Dict
|
| 11 |
+
|
| 12 |
+
from fastapi import FastAPI
|
| 13 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 14 |
+
|
| 15 |
+
# Optional prometheus
|
| 16 |
+
try:
|
| 17 |
+
from prometheus_fastapi_instrumentator import Instrumentator
|
| 18 |
+
PROMETHEUS_AVAILABLE = True
|
| 19 |
+
except ImportError:
|
| 20 |
+
PROMETHEUS_AVAILABLE = False
|
| 21 |
+
Instrumentator = None
|
| 22 |
+
|
| 23 |
+
# Optional slowapi
|
| 24 |
+
try:
|
| 25 |
+
from slowapi import _rate_limit_exceeded_handler
|
| 26 |
+
from slowapi.errors import RateLimitExceeded
|
| 27 |
+
from slowapi.middleware import SlowAPIMiddleware
|
| 28 |
+
SLOWAPI_AVAILABLE = True
|
| 29 |
+
except ImportError:
|
| 30 |
+
SLOWAPI_AVAILABLE = False
|
| 31 |
+
_rate_limit_exceeded_handler = None
|
| 32 |
+
RateLimitExceeded = None
|
| 33 |
+
SlowAPIMiddleware = None
|
| 34 |
+
|
| 35 |
+
# Optional agentic_reliability_framework (risk engine, policy engine, etc.)
|
| 36 |
+
try:
|
| 37 |
+
from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
|
| 38 |
+
from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
|
| 39 |
+
from agentic_reliability_framework.runtime.memory import create_faiss_index, RAGGraphMemory
|
| 40 |
+
from agentic_reliability_framework.runtime.memory.constants import MemoryConstants
|
| 41 |
+
ARF_AVAILABLE = True
|
| 42 |
+
except ImportError:
|
| 43 |
+
ARF_AVAILABLE = False
|
| 44 |
+
RiskEngine = None
|
| 45 |
+
PolicyEngine = None
|
| 46 |
+
create_faiss_index = None
|
| 47 |
+
RAGGraphMemory = None
|
| 48 |
+
MemoryConstants = None
|
| 49 |
+
|
| 50 |
+
# ===== USAGE TRACKER =====
|
| 51 |
+
from app.core.usage_tracker import init_tracker, tracker, Tier
|
| 52 |
+
|
| 53 |
+
from app.api import (
|
| 54 |
+
routes_governance,
|
| 55 |
+
routes_history,
|
| 56 |
+
routes_incidents,
|
| 57 |
+
routes_intents,
|
| 58 |
+
routes_risk,
|
| 59 |
+
routes_memory,
|
| 60 |
+
routes_admin,
|
| 61 |
+
routes_payments,
|
| 62 |
+
webhooks,
|
| 63 |
+
routes_users, # <-- ADDED
|
| 64 |
+
)
|
| 65 |
+
from app.api.deps import limiter
|
| 66 |
+
from app.core.config import settings
|
| 67 |
+
|
| 68 |
+
# API-wide logger; INFO to stdout so container platforms capture the stream.
logger = logging.getLogger("arf.api")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup/shutdown lifecycle for the ARF API.

    On startup, initialises (when the optional framework is installed):
    the Bayesian RiskEngine (mandatory — failure aborts startup), the
    PolicyEngine, RAG graph memory and an optional epistemic model; then
    the usage tracker if ARF_USAGE_TRACKING=true. All components are
    stored on ``app.state`` so route handlers can reach them.

    Raises:
        RuntimeError: if ARF is installed but the RiskEngine fails to start.
    """
    logger.info("🚀 Starting ARF API Control Plane")
    logger.debug(f"Python path: {sys.path}")

    if ARF_AVAILABLE:
        hmc_model_path = os.getenv("ARF_HMC_MODEL", "models/hmc_model.json")
        use_hyperpriors = os.getenv(
            "ARF_USE_HYPERPRIORS",
            "false").lower() == "true"
        logger.info(
            "Initializing RiskEngine – HMC model: %s, hyperpriors: %s",
            hmc_model_path,
            use_hyperpriors)
        try:
            app.state.risk_engine = RiskEngine(
                hmc_model_path=hmc_model_path,
                use_hyperpriors=use_hyperpriors,
                n0=1000,
                hyperprior_weight=0.3,
            )
            logger.info("✅ RiskEngine initialized successfully.")
        except Exception as e:
            # The risk engine is mandatory when ARF is installed: abort startup.
            logger.exception("💥 Fatal error initializing RiskEngine")
            raise RuntimeError("RiskEngine initialization failed") from e

        try:
            app.state.policy_engine = PolicyEngine()
            logger.info("✅ PolicyEngine initialized successfully.")
        except Exception as e:
            # Policy engine is optional — degrade gracefully.
            logger.warning(f"PolicyEngine initialization failed: {e}")
            app.state.policy_engine = None

        try:
            faiss_index = create_faiss_index(dim=MemoryConstants.VECTOR_DIM)
            app.state.rag_graph = RAGGraphMemory(faiss_index)
            logger.info("✅ RAGGraphMemory initialized successfully.")
        except Exception as e:
            # RAG memory is optional — degrade gracefully.
            logger.warning(f"RAGGraphMemory initialization failed: {e}")
            app.state.rag_graph = None

        epistemic_model_name = os.getenv("EPISTEMIC_MODEL", "")
        if epistemic_model_name:
            try:
                from sentence_transformers import SentenceTransformer
                logger.info(f"Loading epistemic model: {epistemic_model_name}")
                app.state.epistemic_model = SentenceTransformer(
                    epistemic_model_name)
                app.state.epistemic_tokenizer = app.state.epistemic_model.tokenizer
                logger.info("✅ Epistemic model loaded.")
            except ImportError:
                logger.warning(
                    "sentence-transformers not installed; epistemic signals will be zeros.")
                app.state.epistemic_model = None
                app.state.epistemic_tokenizer = None
            except Exception as e:
                logger.warning(f"Failed to load epistemic model: {e}")
                app.state.epistemic_model = None
                app.state.epistemic_tokenizer = None
        else:
            logger.info(
                "EPISTEMIC_MODEL not set; epistemic signals will be zeros.")
            app.state.epistemic_model = None
            app.state.epistemic_tokenizer = None
    else:
        logger.warning(
            "agentic_reliability_framework not installed; risk engine, policy engine, RAG disabled.")

    # ===== USAGE TRACKER INITIALISATION =====
    if os.getenv("ARF_USAGE_TRACKING", "false").lower() == "true":
        logger.info("Initialising usage tracker...")
        init_tracker(
            db_path=os.getenv("ARF_USAGE_DB_PATH", "arf_usage.db"),
            redis_url=os.getenv("ARF_REDIS_URL")
        )
        # Seed initial API keys from environment variable (for testing / demo)
        api_keys_json = os.getenv("ARF_API_KEYS", "{}")
        try:
            api_keys = json.loads(api_keys_json)
            for key, tier_str in api_keys.items():
                try:
                    tier = Tier(tier_str.lower())
                    tracker.get_or_create_api_key(key, tier)
                    logger.info(f"Seeded API key for tier {tier.value}")
                except ValueError:
                    # SECURITY FIX: never write raw API keys to the log —
                    # only a short, non-recoverable prefix for debugging.
                    logger.warning(
                        "Invalid tier '%s' for key %s..., skipping",
                        tier_str, key[:4])
        except json.JSONDecodeError:
            logger.warning("ARF_API_KEYS environment variable is not valid JSON; skipping seeding.")
        app.state.usage_tracker = tracker
        logger.info("✅ Usage tracker ready.")
    else:
        logger.info("Usage tracking disabled (ARF_USAGE_TRACKING not set to true).")
        app.state.usage_tracker = None

    yield
    logger.info("🛑 Shutting down ARF API")
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def create_app() -> FastAPI:
    """Build and fully configure the FastAPI application instance."""
    application = FastAPI(
        title=settings.app_name,
        version="0.5.0",
        lifespan=lifespan,
        docs_url="/docs",
        redoc_url="/redoc",
        description="Agentic Reliability Framework (ARF) API",
    )

    # CORS: only the deployed frontend may call the API with credentials.
    application.add_middleware(
        CORSMiddleware,
        allow_origins=["https://arf-frontend-sandy.vercel.app"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    logger.debug("CORS middleware configured")

    if SLOWAPI_AVAILABLE:
        application.state.limiter = limiter
        application.add_exception_handler(
            RateLimitExceeded,
            _rate_limit_exceeded_handler)
        application.add_middleware(SlowAPIMiddleware)
        logger.debug("Rate limiter middleware configured")
    else:
        logger.debug("Rate limiter disabled (slowapi not installed)")

    if PROMETHEUS_AVAILABLE:
        Instrumentator().instrument(application).expose(application)
        logger.debug("Prometheus instrumentator configured")
    else:
        logger.debug(
            "Prometheus instrumentator disabled (module not installed)")

    # Router registration table: (router, prefix or None, tags).
    # Order matches the historical registration order.
    router_specs = [
        (routes_incidents.router, "/api/v1", ["incidents"]),
        (routes_risk.router, "/api/v1", ["risk"]),
        (routes_intents.router, "/api/v1", ["intents"]),
        (routes_history.router, "/api/v1", ["history"]),
        (routes_governance.router, "/api/v1", ["governance"]),
        # NOTE: memory uses a different prefix than the other routers.
        (routes_memory.router, "/v1/memory", ["memory"]),
        (routes_admin.router, "/api/v1", ["admin"]),
        (routes_payments.router, "/api/v1", ["payments"]),
        (webhooks.router, None, ["webhooks"]),
        (routes_users.router, "/api/v1", ["users"]),
    ]
    for router, prefix, tags in router_specs:
        if prefix is None:
            application.include_router(router, tags=tags)
        else:
            application.include_router(router, prefix=prefix, tags=tags)
    logger.debug("All API routers included")

    @application.get("/health", tags=["health"])
    async def health() -> Dict[str, str]:
        """Liveness probe used by orchestrators/load balancers."""
        return {"status": "ok"}

    return application


app = create_app()
|
app/services/__init__.py
ADDED
|
File without changes
|
app/services/incident_service.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from agentic_reliability_framework.core.reliability_signal import signal_to_reliability
|
| 2 |
+
from app.models.incident_models import IncidentReport
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def process_incident(report: IncidentReport) -> float:
    """Convert an incident's raw signal value into a reliability score."""
    return signal_to_reliability(report.value, signal_type=report.signal_type)
|
app/services/intent_adapter.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from agentic_reliability_framework.core.governance.intents import (
|
| 2 |
+
ProvisionResourceIntent,
|
| 3 |
+
GrantAccessIntent,
|
| 4 |
+
DeployConfigurationIntent,
|
| 5 |
+
)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def to_oss_intent(api_request):
    """Translate an API intent request into the corresponding OSS intent object.

    Dispatches on ``api_request.intent_type`` and copies over the fields
    each OSS intent class expects.

    Raises:
        ValueError: if the request carries an unknown ``intent_type``.
    """
    builders = {
        "provision_resource": lambda r: ProvisionResourceIntent(
            resource_type=r.resource_type,
            region=r.region,
            size=r.size,
            configuration=r.configuration,
            environment=r.environment,
            requester=r.requester,
            provenance=r.provenance,
        ),
        "grant_access": lambda r: GrantAccessIntent(
            principal=r.principal,
            permission_level=r.permission_level,
            resource_scope=r.resource_scope,
            justification=r.justification,
            requester=r.requester,
            provenance=r.provenance,
        ),
        "deploy_config": lambda r: DeployConfigurationIntent(
            service_name=r.service_name,
            change_scope=r.change_scope,
            deployment_target=r.deployment_target,
            risk_level_hint=r.risk_level_hint,
            configuration=r.configuration,
            requester=r.requester,
            provenance=r.provenance,
        ),
    }
    build = builders.get(api_request.intent_type)
    if build is None:
        raise ValueError(f"Unknown intent type: {api_request.intent_type}")
    return build(api_request)
|
app/services/intent_service.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import logging
|
| 3 |
+
from app.models.intent_models import IntentSimulation
|
| 4 |
+
|
| 5 |
+
logger = logging.getLogger(__name__)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Note: This endpoint is deprecated. Use /v1/intents/evaluate instead.
|
| 9 |
+
def simulate_intent(intent: IntentSimulation) -> dict:
    """Deprecated random-risk simulation kept for backward compatibility.

    Returns a dict with a uniform-random ``risk_score`` in [0, 1] and a
    ``recommendation`` derived from fixed thresholds (0.2 / 0.6).
    """
    logger.warning("Deprecated endpoint /simulate_intent used. Please migrate to /v1/intents/evaluate.")
    # For backward compatibility, we still use random risk.
    score = random.uniform(0, 1)
    if score >= 0.6:
        verdict = "blocked"
    elif score >= 0.2:
        verdict = "requires_approval"
    else:
        verdict = "safe_to_execute"
    return {"risk_score": score, "recommendation": verdict}
|
app/services/intent_store.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
from sqlalchemy.orm import Session
|
| 3 |
+
from app.database.models_intents import IntentDB
|
| 4 |
+
from typing import Any, Dict, Optional
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def save_evaluated_intent(
    db: Session,
    deterministic_id: str,
    intent_type: str,
    api_payload: Dict[str, Any],
    oss_payload: Dict[str, Any],
    environment: str,
    risk_score: float
) -> IntentDB:
    """Idempotently persist an evaluated intent keyed by its deterministic id.

    If a row with the same ``deterministic_id`` already exists, only its
    evaluation metadata (timestamp, risk score, OSS payload) is refreshed;
    otherwise a new row is inserted. Commits and returns the row.
    """
    now = datetime.datetime.utcnow()
    record = (
        db.query(IntentDB)
        .filter(IntentDB.deterministic_id == deterministic_id)
        .one_or_none()
    )
    if record is None:
        record = IntentDB(
            deterministic_id=deterministic_id,
            intent_type=intent_type,
            payload=api_payload,
            oss_payload=oss_payload,
            environment=environment,
            evaluated_at=now,
            risk_score=str(risk_score),
        )
    else:
        # Re-evaluation: keep the original payload/type, refresh metadata.
        record.evaluated_at = now
        record.risk_score = str(risk_score)
        record.oss_payload = oss_payload
    db.add(record)
    db.commit()
    db.refresh(record)
    return record
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def get_intent_by_deterministic_id(db: Session, deterministic_id: str) -> Optional[IntentDB]:
    """Fetch a stored intent by its deterministic id, or None if absent."""
    query = db.query(IntentDB).filter(IntentDB.deterministic_id == deterministic_id)
    return query.one_or_none()
|
app/services/outcome_service.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import datetime
|
| 2 |
+
import logging
|
| 3 |
+
from typing import Optional, Dict, Any
|
| 4 |
+
|
| 5 |
+
from sqlalchemy.orm import Session
|
| 6 |
+
|
| 7 |
+
from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
|
| 8 |
+
from agentic_reliability_framework.core.governance.intents import (
|
| 9 |
+
InfrastructureIntent,
|
| 10 |
+
ProvisionResourceIntent,
|
| 11 |
+
GrantAccessIntent,
|
| 12 |
+
DeployConfigurationIntent,
|
| 13 |
+
)
|
| 14 |
+
from app.database.models_intents import IntentDB, OutcomeDB
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class OutcomeConflictError(Exception):
    """Raised when an outcome is re-recorded with a different success value."""
    pass
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def reconstruct_oss_intent_from_json(oss_json: Dict[str, Any]) -> InfrastructureIntent:
    """Rebuild a concrete OSS intent object from its stored JSON payload.

    Raises:
        ValueError: if ``intent_type`` is missing or unrecognised.
    """
    intent_classes = {
        "provision_resource": ProvisionResourceIntent,
        "grant_access": GrantAccessIntent,
        "deploy_config": DeployConfigurationIntent,
    }
    intent_type = oss_json.get("intent_type")
    intent_cls = intent_classes.get(intent_type)
    if intent_cls is None:
        raise ValueError(
            f"Cannot reconstruct intent from JSON: missing or unknown intent_type {intent_type}"
        )
    return intent_cls(**oss_json)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _create_dummy_intent(intent_type: str) -> Optional[InfrastructureIntent]:
    """Create a valid placeholder intent for a given intent type.

    Used as a fallback when the stored OSS payload is absent or cannot be
    reconstructed, so the risk engine can still be updated. Only
    provisioning intents are currently supported.

    Args:
        intent_type: Either the API-level type string ("provision_resource",
            as produced by the intent adapter and stored in IntentDB) or the
            OSS class name ("ProvisionResourceIntent").

    Returns:
        A minimal valid intent, or None when the type is unsupported.
    """
    # BUG FIX: rows are stored with API-level type strings such as
    # "provision_resource" (see intent_adapter.to_oss_intent), so matching
    # only the class name meant this fallback effectively never fired.
    # The redundant function-local import was also removed —
    # ProvisionResourceIntent is already imported at module level.
    if intent_type in ("provision_resource", "ProvisionResourceIntent"):
        # Use string values directly; they must be valid according to the model
        return ProvisionResourceIntent(
            resource_type="vm",
            region="eastus",
            size="Standard_D2s_v3",
            environment="dev",  # plain string instead of Environment.dev
            requester="system"
        )
    logger.warning("Dummy intent creation not implemented for %s", intent_type)
    return None
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def record_outcome(
    db: Session,
    deterministic_id: str,
    success: bool,
    recorded_by: Optional[str],
    notes: Optional[str],
    risk_engine: RiskEngine
) -> OutcomeDB:
    """Record the execution outcome of an intent and feed it to the risk engine.

    Idempotent: re-recording the same success value returns the existing
    row; a conflicting value raises OutcomeConflictError. The outcome is
    committed to the database *before* the risk-engine update, so a
    risk-engine failure never loses the recorded outcome.

    Raises:
        ValueError: if no intent with ``deterministic_id`` exists.
        OutcomeConflictError: if an outcome with a different success value
            was already recorded.
    """
    intent = db.query(IntentDB).filter(IntentDB.deterministic_id == deterministic_id).one_or_none()
    if not intent:
        raise ValueError(f"Intent not found: {deterministic_id}")

    # Idempotency check: at most one outcome per intent (unique constraint).
    existing_outcome = db.query(OutcomeDB).filter(OutcomeDB.intent_id == intent.id).one_or_none()
    if existing_outcome:
        if existing_outcome.success == success:
            return existing_outcome
        raise OutcomeConflictError("Outcome already recorded with different result")

    outcome = OutcomeDB(
        intent_id=intent.id,
        success=bool(success),
        recorded_by=recorded_by,
        notes=notes,
        recorded_at=datetime.datetime.utcnow()  # will be replaced with timezone-aware later
    )
    db.add(outcome)
    db.commit()
    db.refresh(outcome)

    # Determine OSS intent for risk engine update
    oss_intent = None
    if intent.oss_payload:
        try:
            oss_intent = reconstruct_oss_intent_from_json(intent.oss_payload)
        except Exception as e:
            # Reconstruction failure is non-fatal: fall back to a dummy intent
            # so the risk engine still receives the outcome signal.
            logger.warning(
                "Failed to reconstruct OSS intent for %s: %s. Using dummy fallback.",
                deterministic_id, e
            )
            oss_intent = _create_dummy_intent(intent.intent_type)
    else:
        oss_intent = _create_dummy_intent(intent.intent_type)

    # Update risk engine if we have an intent
    if oss_intent is not None:
        try:
            risk_engine.update_outcome(oss_intent, success)
        except Exception as e:
            # Outcome is already persisted; log and continue rather than fail.
            logger.exception(
                "Failed to update RiskEngine after recording outcome for intent %s: %s",
                deterministic_id, e
            )
    else:
        logger.error(
            "No valid OSS intent available for risk engine update; skipping outcome for %s",
            deterministic_id
        )

    return outcome
|
app/services/risk_service.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
|
| 2 |
+
from agentic_reliability_framework.core.governance.intents import InfrastructureIntent
|
| 3 |
+
from typing import Optional, List, Dict, Any
|
| 4 |
+
|
| 5 |
+
from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction
|
| 6 |
+
from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
|
| 7 |
+
from agentic_reliability_framework.core.decision.decision_engine import DecisionEngine
|
| 8 |
+
from agentic_reliability_framework.runtime.memory.rag_graph import RAGGraphMemory
|
| 9 |
+
|
| 10 |
+
# NEW: Import eclipse probe
|
| 11 |
+
from agentic_reliability_framework.core.research.eclipse_probe import compute_epistemic_risk
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def evaluate_intent(
    engine: RiskEngine,
    intent: InfrastructureIntent,
    cost_estimate: Optional[float],
    policy_violations: List[str]
) -> dict:
    """Score an infrastructure intent with the Bayesian risk engine.

    Returns:
        Dict with the numeric risk score, a human-readable explanation
        and the per-factor contributions.
    """
    risk_score, explanation, contributions = engine.calculate_risk(
        intent=intent,
        cost_estimate=cost_estimate,
        policy_violations=policy_violations,
    )
    return {
        "risk_score": risk_score,
        "explanation": explanation,
        "contributions": contributions,
    }
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def evaluate_healing_decision(
    event: ReliabilityEvent,
    policy_engine: PolicyEngine,
    decision_engine: Optional[DecisionEngine] = None,
    rag_graph: Optional[RAGGraphMemory] = None,
    model=None,  # NEW: optional HuggingFace model
    tokenizer=None,  # NEW: optional tokenizer
) -> Dict[str, Any]:
    """
    Evaluate healing actions for a given reliability event using decision‑theoretic selection.
    Now includes epistemic risk signals from the eclipse probe.

    Args:
        event: The reliability event (component metrics) to heal.
        policy_engine: Source of candidate actions; its `use_decision_engine`
            flag is temporarily disabled and always restored.
        decision_engine: Optional; falls back to `policy_engine.decision_engine`
            or a fresh DecisionEngine built over `rag_graph`.
        model/tokenizer: Optional epistemic-probe model pair; when absent,
            epistemic signals fall back to all zeros.

    Returns:
        Dictionary with keys: risk_score, selected_action, expected_utility, alternatives,
        explanation, epistemic_signals (new).
    """
    # If decision_engine not provided, try to get from policy_engine
    if decision_engine is None and hasattr(policy_engine, 'decision_engine'):
        decision_engine = policy_engine.decision_engine

    # If still None, create a minimal one (global stats only)
    if decision_engine is None:
        decision_engine = DecisionEngine(rag_graph=rag_graph)

    # Get raw candidate actions (by temporarily disabling decision engine).
    # try/finally guarantees the flag is restored even if evaluation raises.
    orig_use = policy_engine.use_decision_engine
    try:
        policy_engine.use_decision_engine = False
        raw_actions = policy_engine.evaluate_policies(event)
    finally:
        policy_engine.use_decision_engine = orig_use

    # If no actions, return NO_ACTION
    if not raw_actions or raw_actions == [HealingAction.NO_ACTION]:
        return {
            "risk_score": 0.0,
            "selected_action": HealingAction.NO_ACTION.value,
            "expected_utility": 0.0,
            "alternatives": [],
            "explanation": "No candidate actions triggered.",
            "epistemic_signals": None,
        }

    # === NEW: Compute epistemic signals from triggered policies ===
    # Build reasoning text from the policies that triggered the actions
    reasoning_parts = []
    for policy in policy_engine.policies:
        # Check if any of the policy's actions are in raw_actions
        if any(a in policy.actions for a in raw_actions):
            conditions_str = ", ".join(
                f"{c.metric} {c.operator} {c.threshold}" for c in policy.conditions
            )
            reasoning_parts.append(
                f"Policy {policy.name} triggered by {conditions_str} → actions {[a.value for a in policy.actions]}"
            )
    reasoning_text = " ".join(reasoning_parts)

    # Build evidence text from the event
    evidence_text = (
        f"Component: {event.component}, "
        f"latency_p99: {event.latency_p99}, "
        f"error_rate: {event.error_rate}, "
        f"cpu_util: {event.cpu_util}, "
        f"memory_util: {event.memory_util}"
    )

    # Compute epistemic signals (if model/tokenizer provided)
    epistemic_signals = None
    if model is not None and tokenizer is not None:
        epistemic_signals = compute_epistemic_risk(
            reasoning_text, evidence_text, model, tokenizer
        )
    else:
        # In OSS, we may not have model; use zeros as fallback
        epistemic_signals = {
            "entropy": 0.0,
            "contradiction": 0.0,
            "evidence_lift": 0.0,
            "hallucination_risk": 0.0,
        }

    # Run decision engine to get best action and alternatives, passing epistemic signals
    decision = decision_engine.select_optimal_action(
        raw_actions, event, component=event.component,
        epistemic_signals=epistemic_signals
    )

    # Risk of the selected action: reuse the alternative entry if present.
    risk_score = None
    for alt in decision.alternatives:
        if alt.action == decision.best_action:
            risk_score = alt.risk
            break
    if risk_score is None:
        # Compute risk separately
        risk_score = decision_engine.compute_risk(decision.best_action, event, event.component)

    # Format alternatives (top 3 only)
    alt_list = []
    for alt in decision.alternatives[:3]:
        alt_list.append({
            "action": alt.action.value,
            "expected_utility": alt.utility,
            "risk": alt.risk,
        })

    # Build final response
    response = {
        "risk_score": risk_score,
        "selected_action": decision.best_action.value,
        "expected_utility": decision.expected_utility,
        "alternatives": alt_list,
        "explanation": decision.explanation,
        "raw_decision": decision.raw_data,
        "epistemic_signals": epistemic_signals,  # NEW
    }
    return response
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def get_system_risk() -> float:
    """Deprecated placeholder: return a random system risk in [0, 1].

    Kept only for backward compatibility with the old endpoint; the value
    is rounded to two decimal places.
    """
    import random

    score = random.uniform(0, 1)
    return round(score, 2)
|