petter2025 committed on
Commit
2d521fd
·
verified ·
1 Parent(s): bc93cf1

Add FastAPI app

Browse files
app/__init__.py ADDED
File without changes
app/api/__init__.py ADDED
File without changes
app/api/deps.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from app.database.session import SessionLocal
3
+ from slowapi import Limiter
4
+ from slowapi.util import get_remote_address
5
+ from app.core.config import settings
6
+
7
+ # ARF core engine imports
8
+ from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
9
+ from agentic_reliability_framework.core.decision.decision_engine import DecisionEngine
10
+ from agentic_reliability_framework.core.governance.stability_controller import LyapunovStabilityController
11
+ from agentic_reliability_framework.core.governance.causal_explainer import CausalExplainer
12
+ from agentic_reliability_framework.runtime.memory.rag_graph import RAGGraphMemory
13
+ from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction
14
+
15
+
16
def get_db():
    """FastAPI dependency that yields a database session.

    The session is closed after the request finishes, even when the
    route handler raised.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
23
+
24
+
25
+ # Rate limiter with default limit from settings
26
+ limiter = Limiter(key_func=get_remote_address, default_limits=[settings.RATE_LIMIT])
27
+
28
+
29
+ # ARF engine dependencies (singletons for simplicity)
30
+ _risk_engine = None
31
+ _decision_engine = None
32
+ _stability_controller = None
33
+ _causal_explainer = None
34
+ _rag_graph = None
35
+
36
+
37
def _seed_rag_graph(rag):
    """Populate a fresh RAG graph with synthetic historical outcomes.

    Gives the decision engine non-empty priors for every healing action
    before any real incidents have been recorded.
    """
    # (incident_id, component, action, success, resolution_minutes)
    seeds = [
        ("seed_restart_1", "test", HealingAction.RESTART_CONTAINER.value, True, 2),
        ("seed_restart_2", "test", HealingAction.RESTART_CONTAINER.value, True, 3),
        ("seed_restart_3", "test", HealingAction.RESTART_CONTAINER.value, False, 10),
        ("seed_rollback_1", "test", HealingAction.ROLLBACK.value, True, 1),
        ("seed_rollback_2", "test", HealingAction.ROLLBACK.value, True, 2),
        ("seed_rollback_3", "test", HealingAction.ROLLBACK.value, False, 5),
        ("seed_scale_1", "test", HealingAction.SCALE_OUT.value, True, 5),
        ("seed_scale_2", "test", HealingAction.SCALE_OUT.value, False, 15),
        ("seed_cb_1", "test", HealingAction.CIRCUIT_BREAKER.value, True, 1),
        ("seed_cb_2", "test", HealingAction.CIRCUIT_BREAKER.value, True, 2),
        ("seed_ts_1", "test", HealingAction.TRAFFIC_SHIFT.value, True, 4),
        ("seed_ts_2", "test", HealingAction.TRAFFIC_SHIFT.value, False, 8),
    ]
    for incident_id, component, action, succeeded, minutes in seeds:
        # Every seed shares the same synthetic degraded-metrics event.
        sample_event = ReliabilityEvent(
            component=component,
            latency_p99=500,
            error_rate=0.1,
            service_mesh="default",
        )
        rag.record_outcome(
            incident_id=incident_id,
            event=sample_event,
            action_taken=action,
            success=succeeded,
            resolution_time_minutes=minutes,
        )
    print("Seeded RAG graph with historical data", file=sys.stderr)
68
+
69
+
70
def get_rag_graph():
    """Return the process-wide RAGGraphMemory, creating and seeding it once."""
    global _rag_graph
    if _rag_graph is None:
        graph = RAGGraphMemory()
        _seed_rag_graph(graph)
        _rag_graph = graph
    return _rag_graph
76
+
77
+
78
def get_decision_engine():
    """Return the singleton DecisionEngine, backed by the shared RAG graph."""
    global _decision_engine
    if _decision_engine is None:
        _decision_engine = DecisionEngine(rag_graph=get_rag_graph())
    return _decision_engine
84
+
85
+
86
def get_risk_engine():
    """Return the lazily-created singleton RiskEngine."""
    global _risk_engine
    if _risk_engine is None:
        _risk_engine = RiskEngine()
    return _risk_engine
91
+
92
+
93
def get_stability_controller():
    """Return the lazily-created singleton LyapunovStabilityController."""
    global _stability_controller
    if _stability_controller is None:
        _stability_controller = LyapunovStabilityController()
    return _stability_controller
98
+
99
+
100
def get_causal_explainer():
    """Return the lazily-created singleton CausalExplainer."""
    global _causal_explainer
    if _causal_explainer is None:
        _causal_explainer = CausalExplainer()
    return _causal_explainer
app/api/routes_admin.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Admin API endpoints for API key management and audit logs.
3
+ These endpoints should be protected (e.g., by an admin API key) in production.
4
+ """
5
+ from fastapi import APIRouter, Depends, HTTPException, Query, Path, Body
6
+ from pydantic import BaseModel
7
+ from typing import Optional, List, Dict, Any
8
+ from datetime import datetime
9
+ import uuid
10
+
11
+ from app.core.usage_tracker import tracker, Tier
12
+
13
+ router = APIRouter(prefix="/admin", tags=["admin"])
14
+
15
# Development fallback only; override via the ARF_ADMIN_API_KEY environment
# variable so the secret never has to live in source control.
ADMIN_API_KEY = "admin_secret_change_me"


def verify_admin(admin_key: str = Query(..., alias="admin_key")):
    """FastAPI dependency guarding admin routes.

    Compares the ``admin_key`` query parameter against the configured admin
    secret (``ARF_ADMIN_API_KEY`` env var, falling back to the hard-coded
    development default) and raises 403 on mismatch.

    NOTE(review): secrets in the query string leak into access logs; moving
    this to a header is a recommended follow-up.
    """
    import os
    import secrets as _secrets
    expected = os.getenv("ARF_ADMIN_API_KEY", ADMIN_API_KEY)
    # compare_digest avoids leaking the key length/prefix via timing.
    if not _secrets.compare_digest(str(admin_key), expected):
        raise HTTPException(status_code=403, detail="Invalid admin key")
    return True
22
+
23
+ class CreateKeyRequest(BaseModel):
24
+ tier: str
25
+
26
+ class UpdateTierRequest(BaseModel):
27
+ tier: str
28
+
29
+
30
@router.post("/keys", dependencies=[Depends(verify_admin)])
async def create_api_key(req: CreateKeyRequest):
    """Mint a new live API key at the requested tier (admin only)."""
    valid_tiers = [t.value for t in Tier]
    if req.tier not in valid_tiers:
        raise HTTPException(status_code=400, detail=f"Invalid tier. Must be one of {valid_tiers}")
    new_key = f"sk_live_{uuid.uuid4().hex[:24]}"
    tracker.get_or_create_api_key(new_key, Tier(req.tier))
    return {"api_key": new_key, "tier": req.tier}
38
+
39
+
40
@router.get("/keys", dependencies=[Depends(verify_admin)])
async def list_api_keys(limit: int = 100, offset: int = 0):
    """List API keys (newest first) with their current-month usage.

    ``total`` is the full number of rows in ``api_keys`` (the original
    returned the page length, which made pagination impossible to drive).
    """
    # Loop-invariant: the month key does not change per row.
    month = tracker._get_month_key()
    with tracker._get_conn() as conn:
        total = conn.execute("SELECT COUNT(*) FROM api_keys").fetchone()[0]
        rows = conn.execute(
            "SELECT key, tier, created_at, last_used_at, is_active FROM api_keys ORDER BY created_at DESC LIMIT ? OFFSET ?",
            (limit, offset)
        ).fetchall()
        keys = []
        for row in rows:
            usage_row = conn.execute(
                "SELECT count FROM monthly_counts WHERE api_key = ? AND year_month = ?",
                (row["key"], month)
            ).fetchone()
            keys.append({
                "key": row["key"],
                "tier": row["tier"],
                "created_at": datetime.fromtimestamp(row["created_at"]).isoformat(),
                "last_used_at": datetime.fromtimestamp(row["last_used_at"]).isoformat() if row["last_used_at"] else None,
                "is_active": bool(row["is_active"]),
                "current_month_usage": usage_row["count"] if usage_row else 0,
            })
    return {"keys": keys, "total": total}
64
+
65
+
66
+ @router.patch("/keys/{api_key}/tier", dependencies=[Depends(verify_admin)])
67
+ async def update_key_tier(
68
+ api_key: str = Path(..., description="The API key to update"),
69
+ req: UpdateTierRequest = Body(...),
70
+ ):
71
+ if req.tier not in [t.value for t in Tier]:
72
+ raise HTTPException(status_code=400, detail=f"Invalid tier. Must be one of {[t.value for t in Tier]}")
73
+ with tracker._get_conn() as conn:
74
+ row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
75
+ if not row:
76
+ raise HTTPException(status_code=404, detail="API key not found")
77
+ conn.execute("UPDATE api_keys SET tier = ? WHERE key = ?", (req.tier, api_key))
78
+ conn.commit()
79
+ return {"message": f"Tier updated to {req.tier}"}
80
+
81
+
82
+ @router.delete("/keys/{api_key}", dependencies=[Depends(verify_admin)])
83
+ async def deactivate_api_key(api_key: str = Path(..., description="The API key to deactivate")):
84
+ with tracker._get_conn() as conn:
85
+ row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
86
+ if not row:
87
+ raise HTTPException(status_code=404, detail="API key not found")
88
+ conn.execute("UPDATE api_keys SET is_active = 0 WHERE key = ?", (api_key,))
89
+ conn.commit()
90
+ return {"message": "API key deactivated"}
91
+
92
+
93
@router.get("/audit/{api_key}", dependencies=[Depends(verify_admin)])
async def get_audit_logs(
    api_key: str = Path(..., description="The API key to audit"),
    start_date: Optional[str] = Query(None),
    end_date: Optional[str] = Query(None),
    limit: int = 100,
):
    """Return audit log entries for one API key, optionally date-bounded."""
    try:
        start = datetime.fromisoformat(start_date) if start_date else None
        end = datetime.fromisoformat(end_date) if end_date else None
    except ValueError:
        # Malformed client input is a 400, not an unhandled 500.
        raise HTTPException(status_code=400, detail="start_date/end_date must be ISO-8601 timestamps")
    logs = tracker.get_audit_logs(api_key, start, end, limit)
    return {"api_key": api_key, "logs": logs}
104
+
105
+
106
+ @router.get("/stats", dependencies=[Depends(verify_admin)])
107
+ async def get_global_stats():
108
+ with tracker._get_conn() as conn:
109
+ total_keys = conn.execute("SELECT COUNT(*) FROM api_keys WHERE is_active = 1").fetchone()[0]
110
+ total_requests = conn.execute("SELECT COUNT(*) FROM usage_log").fetchone()[0]
111
+ by_tier = conn.execute(
112
+ "SELECT tier, COUNT(*) as count FROM usage_log GROUP BY tier"
113
+ ).fetchall()
114
+ month = tracker._get_month_key()
115
+ current_month_requests = conn.execute(
116
+ "SELECT SUM(count) FROM monthly_counts WHERE year_month = ?", (month,)
117
+ ).fetchone()[0] or 0
118
+ return {
119
+ "active_api_keys": total_keys,
120
+ "total_evaluations": total_requests,
121
+ "current_month_evaluations": current_month_requests,
122
+ "by_tier": [{"tier": row[0], "count": row[1]} for row in by_tier],
123
+ }
app/api/routes_governance.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends, HTTPException, Request, BackgroundTasks
2
+ from fastapi.encoders import jsonable_encoder
3
+ from sqlalchemy.orm import Session
4
+ from app.models.infrastructure_intents import InfrastructureIntentRequest
5
+ from app.services.intent_adapter import to_oss_intent
6
+ from app.services.risk_service import evaluate_intent, evaluate_healing_decision
7
+ from app.services.intent_store import save_evaluated_intent
8
+ from app.services.outcome_service import record_outcome
9
+ from app.api.deps import get_db
10
+ from pydantic import BaseModel
11
+ import uuid
12
+ import logging
13
+ import time
14
+
15
+ from agentic_reliability_framework.core.models.event import ReliabilityEvent
16
+
17
+ # ===== USAGE TRACKER IMPORTS =====
18
+ from app.core.usage_tracker import enforce_quota, UsageRecord, tracker
19
+
20
+ logger = logging.getLogger(__name__)
21
+ router = APIRouter()
22
+
23
+
24
+ class OutcomeRequest(BaseModel):
25
+ deterministic_id: str
26
+ success: bool
27
+ recorded_by: str
28
+ notes: str = ""
29
+
30
+
31
+ class HealingDecisionRequest(BaseModel):
32
+ event: ReliabilityEvent
33
+
34
+
35
+ @router.post("/intents/evaluate")
36
+ async def evaluate_intent_endpoint(
37
+ request: Request,
38
+ intent_req: InfrastructureIntentRequest,
39
+ background_tasks: BackgroundTasks,
40
+ db: Session = Depends(get_db),
41
+ quota: dict = Depends(enforce_quota)
42
+ ):
43
+ start_time = time.time()
44
+ api_key = quota["api_key"]
45
+ tier = quota["tier"]
46
+ response_data = None
47
+ error_msg = None
48
+
49
+ try:
50
+ oss_intent = to_oss_intent(intent_req)
51
+ risk_engine = request.app.state.risk_engine
52
+ result = evaluate_intent(
53
+ engine=risk_engine,
54
+ intent=oss_intent,
55
+ cost_estimate=intent_req.estimated_cost,
56
+ policy_violations=intent_req.policy_violations
57
+ )
58
+
59
+ deterministic_id = str(uuid.uuid4())
60
+ api_payload = jsonable_encoder(intent_req.model_dump())
61
+ oss_payload = jsonable_encoder(oss_intent.model_dump())
62
+
63
+ save_evaluated_intent(
64
+ db=db,
65
+ deterministic_id=deterministic_id,
66
+ intent_type=intent_req.intent_type,
67
+ api_payload=api_payload,
68
+ oss_payload=oss_payload,
69
+ environment=str(intent_req.environment),
70
+ risk_score=result["risk_score"]
71
+ )
72
+
73
+ result["intent_id"] = deterministic_id
74
+ response_data = result
75
+
76
+ if tracker:
77
+ record = UsageRecord(
78
+ api_key=api_key,
79
+ tier=tier,
80
+ timestamp=time.time(),
81
+ endpoint="/api/v1/intents/evaluate",
82
+ request_body=intent_req.model_dump(),
83
+ response=response_data,
84
+ processing_ms=(time.time() - start_time) * 1000,
85
+ )
86
+ await tracker.increment_usage_async(record, background_tasks)
87
+
88
+ return response_data
89
+
90
+ except HTTPException:
91
+ raise
92
+ except Exception as e:
93
+ error_msg = str(e)
94
+ logger.exception("Error in evaluate_intent_endpoint")
95
+ if tracker:
96
+ record = UsageRecord(
97
+ api_key=api_key,
98
+ tier=tier,
99
+ timestamp=time.time(),
100
+ endpoint="/api/v1/intents/evaluate",
101
+ request_body=intent_req.model_dump(),
102
+ error=error_msg,
103
+ processing_ms=(time.time() - start_time) * 1000,
104
+ )
105
+ await tracker.increment_usage_async(record, background_tasks)
106
+ raise HTTPException(status_code=500, detail=error_msg)
107
+
108
+
109
+ @router.post("/intents/outcome")
110
+ async def record_outcome_endpoint(
111
+ request: Request,
112
+ outcome: OutcomeRequest,
113
+ db: Session = Depends(get_db)
114
+ ):
115
+ # No usage tracking for outcomes (doesn't count against quota)
116
+ try:
117
+ risk_engine = request.app.state.risk_engine
118
+ outcome_record = record_outcome(
119
+ db=db,
120
+ deterministic_id=outcome.deterministic_id,
121
+ success=outcome.success,
122
+ recorded_by=outcome.recorded_by,
123
+ notes=outcome.notes,
124
+ risk_engine=risk_engine
125
+ )
126
+ return {"message": "Outcome recorded", "outcome_id": outcome_record.id}
127
+ except Exception as e:
128
+ raise HTTPException(status_code=500, detail=str(e))
129
+
130
+
131
+ @router.post("/healing/evaluate")
132
+ async def evaluate_healing_decision_endpoint(
133
+ request: Request,
134
+ decision_req: HealingDecisionRequest,
135
+ background_tasks: BackgroundTasks,
136
+ quota: dict = Depends(enforce_quota)
137
+ ):
138
+ start_time = time.time()
139
+ api_key = quota["api_key"]
140
+ tier = quota["tier"]
141
+ response_data = None
142
+ error_msg = None
143
+
144
+ try:
145
+ policy_engine = request.app.state.policy_engine
146
+ rag_graph = getattr(request.app.state, "rag_graph", None)
147
+ model = getattr(request.app.state, "epistemic_model", None)
148
+ tokenizer = getattr(request.app.state, "epistemic_tokenizer", None)
149
+
150
+ response_data = evaluate_healing_decision(
151
+ event=decision_req.event,
152
+ policy_engine=policy_engine,
153
+ decision_engine=None,
154
+ rag_graph=rag_graph,
155
+ model=model,
156
+ tokenizer=tokenizer,
157
+ )
158
+
159
+ if tracker:
160
+ record = UsageRecord(
161
+ api_key=api_key,
162
+ tier=tier,
163
+ timestamp=time.time(),
164
+ endpoint="/api/v1/healing/evaluate",
165
+ request_body=decision_req.model_dump(),
166
+ response=response_data,
167
+ processing_ms=(time.time() - start_time) * 1000,
168
+ )
169
+ await tracker.increment_usage_async(record, background_tasks)
170
+
171
+ return response_data
172
+
173
+ except HTTPException:
174
+ raise
175
+ except Exception as e:
176
+ error_msg = str(e)
177
+ logger.exception("Error in evaluate_healing_decision_endpoint")
178
+ if tracker:
179
+ record = UsageRecord(
180
+ api_key=api_key,
181
+ tier=tier,
182
+ timestamp=time.time(),
183
+ endpoint="/api/v1/healing/evaluate",
184
+ request_body=decision_req.model_dump(),
185
+ error=error_msg,
186
+ processing_ms=(time.time() - start_time) * 1000,
187
+ )
188
+ await tracker.increment_usage_async(record, background_tasks)
189
+ raise HTTPException(status_code=500, detail=error_msg)
app/api/routes_history.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter
2
+ from app.core.storage import incident_history
3
+
4
+ router = APIRouter()
5
+
6
+
7
+ @router.get("/history")
8
+ async def get_history():
9
+ return {"incidents": incident_history}
app/api/routes_incidents.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from app.causal_explainer import CausalExplainer
2
+ from fastapi import APIRouter, Depends, Request, BackgroundTasks, HTTPException
3
+ from pydantic import BaseModel
4
+ from typing import Optional
5
+ from enum import Enum
6
+ import time
7
+ import json
8
+
9
+ # ===== USAGE TRACKER IMPORTS =====
10
+ from app.core.usage_tracker import enforce_quota, UsageRecord, tracker
11
+
12
+
13
class HealingAction(str, Enum):
    """Closed set of remediation actions the advisory engine can propose."""

    NO_ACTION = "no_action"
    RESTART_CONTAINER = "restart_container"
    SCALE_OUT = "scale_out"
    ROLLBACK = "rollback"
    CIRCUIT_BREAKER = "circuit_breaker"
    TRAFFIC_SHIFT = "traffic_shift"
    ALERT_TEAM = "alert_team"
21
+
22
+
23
+ class ReliabilityEvent(BaseModel):
24
+ component: str
25
+ latency_p99: float
26
+ error_rate: float
27
+ service_mesh: str = "default"
28
+ cpu_util: Optional[float] = None
29
+ memory_util: Optional[float] = None
30
+
31
+
32
+ router = APIRouter()
33
+ incident_history = []
34
+
35
+
36
@router.post("/report_incident")
async def report_incident(event: ReliabilityEvent):
    """Append the reported incident to the in-memory history buffer."""
    # model_dump() is the pydantic-v2 spelling; .dict() is deprecated and
    # this keeps the module consistent with routes_governance.py.
    incident_history.append(event.model_dump())
    return {"status": "recorded"}
40
+
41
+
42
+ @router.post("/v1/incidents/evaluate")
43
+ async def evaluate_incident(
44
+ request: Request,
45
+ event: ReliabilityEvent,
46
+ background_tasks: BackgroundTasks,
47
+ quota: dict = Depends(enforce_quota)
48
+ ):
49
+ start_time = time.time()
50
+ api_key = quota["api_key"]
51
+ tier = quota["tier"]
52
+ response_data = None
53
+ error_msg = None
54
+
55
+ try:
56
+ # Simple risk score (heuristic)
57
+ risk_score = min(1.0, (event.latency_p99 / 1000.0) * 0.7 + event.error_rate * 0.3)
58
+
59
+ if event.latency_p99 > 500 or event.error_rate > 0.15:
60
+ optimal_action = HealingAction.RESTART_CONTAINER
61
+ else:
62
+ optimal_action = HealingAction.NO_ACTION
63
+
64
+ current_state = {
65
+ "latency": event.latency_p99,
66
+ "error_rate": event.error_rate,
67
+ "last_action": {"action_type": "no_action"}
68
+ }
69
+ proposed_action = {"action_type": optimal_action.value, "params": {}}
70
+ ce = CausalExplainer()
71
+ causal_exp = ce.explain_healing_intent(proposed_action, current_state, "latency")
72
+
73
+ healing_intent = {
74
+ "action": optimal_action.value,
75
+ "component": event.component,
76
+ "parameters": proposed_action["params"],
77
+ "justification": f"Causal: {causal_exp.explanation_text}",
78
+ "confidence": 0.85,
79
+ "risk_score": risk_score,
80
+ "status": "oss_advisory_only"
81
+ }
82
+
83
+ response_data = {
84
+ "healing_intent": healing_intent,
85
+ "causal_explanation": {
86
+ "factual_outcome": causal_exp.factual_outcome,
87
+ "counterfactual_outcome": causal_exp.counterfactual_outcome,
88
+ "effect": causal_exp.effect,
89
+ "explanation_text": causal_exp.explanation_text,
90
+ "is_model_based": causal_exp.is_model_based,
91
+ "warnings": causal_exp.warnings
92
+ },
93
+ "utility_decision": {
94
+ "best_action": optimal_action.value,
95
+ "expected_utility": 0.5,
96
+ "explanation": "Heuristic decision based on latency/error thresholds"
97
+ }
98
+ }
99
+
100
+ # Asynchronous usage logging
101
+ if tracker:
102
+ record = UsageRecord(
103
+ api_key=api_key,
104
+ tier=tier,
105
+ timestamp=time.time(),
106
+ endpoint="/v1/incidents/evaluate",
107
+ request_body=event.dict(),
108
+ response=response_data,
109
+ processing_ms=(time.time() - start_time) * 1000,
110
+ )
111
+ await tracker.increment_usage_async(record, background_tasks)
112
+
113
+ return response_data
114
+
115
+ except HTTPException:
116
+ raise
117
+ except Exception as e:
118
+ error_msg = str(e)
119
+ # Log failure in background
120
+ if tracker:
121
+ record = UsageRecord(
122
+ api_key=api_key,
123
+ tier=tier,
124
+ timestamp=time.time(),
125
+ endpoint="/v1/incidents/evaluate",
126
+ request_body=event.dict(),
127
+ error=error_msg,
128
+ processing_ms=(time.time() - start_time) * 1000,
129
+ )
130
+ await tracker.increment_usage_async(record, background_tasks)
131
+ raise HTTPException(status_code=500, detail=error_msg)
app/api/routes_intents.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException
2
+ from app.models.intent_models import IntentSimulation, IntentSimulationResponse
3
+ from app.services.intent_service import simulate_intent
4
+
5
+ router = APIRouter()
6
+
7
+
8
+ @router.post("/simulate_intent", response_model=IntentSimulationResponse)
9
+ async def simulate_intent_endpoint(intent: IntentSimulation):
10
+ try:
11
+ result = simulate_intent(intent)
12
+ return IntentSimulationResponse(**result)
13
+ except Exception as e:
14
+ raise HTTPException(status_code=500, detail=str(e))
app/api/routes_memory.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Request
2
+
3
+ router = APIRouter()
4
+
5
+
6
+ @router.get("/stats")
7
+ async def memory_stats(request: Request):
8
+ """
9
+ Return current memory graph statistics using the risk engine's memory instance.
10
+ """
11
+ risk_engine = request.app.state.risk_engine
12
+
13
+ # Check if memory exists and has the required method
14
+ if hasattr(risk_engine, 'memory') and hasattr(risk_engine.memory, 'get_graph_stats'):
15
+ stats = risk_engine.memory.get_graph_stats()
16
+ return stats
17
+ else:
18
+ # Graceful fallback (e.g., during testing or if memory not initialized)
19
+ return {
20
+ "incident_nodes": 0,
21
+ "outcome_nodes": 0,
22
+ "edges": 0,
23
+ "message": "Memory not fully initialized"
24
+ }
app/api/routes_payments.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Payment endpoints – Stripe Checkout integration.
3
+ """
4
+
5
+ import os
6
+ import stripe
7
+ from fastapi import APIRouter, HTTPException, Request
8
+ from pydantic import BaseModel
9
+ from typing import Optional
10
+
11
+ from app.core.config import settings
12
+ from app.core.usage_tracker import tracker, Tier
13
+
14
+ router = APIRouter(prefix="/payments", tags=["payments"])
15
+
16
+ # Set Stripe API key (from environment)
17
+ stripe.api_key = os.getenv("STRIPE_SECRET_KEY")
18
+ STRIPE_WEBHOOK_SECRET = os.getenv("STRIPE_WEBHOOK_SECRET")
19
+
20
+ class CheckoutRequest(BaseModel):
21
+ api_key: str
22
+ success_url: str
23
+ cancel_url: str
24
+
25
+
26
@router.post("/create-checkout-session")
async def create_checkout_session(req: CheckoutRequest):
    """Create a Stripe Checkout session upgrading a FREE key to Pro.

    Raises 500 if Stripe is unconfigured, 503 if usage tracking is
    disabled, and 400 if the key is not currently on the free tier.
    """
    if not stripe.api_key:
        raise HTTPException(status_code=500, detail="Stripe not configured")

    if tracker is None:
        # Previously tier stayed None here and the caller got the misleading
        # "Only free tier keys can be upgraded" error instead of a 503.
        raise HTTPException(status_code=503, detail="Usage tracking not available")

    # Verify the API key exists and is free tier
    tier = tracker.get_tier(req.api_key)
    if tier != Tier.FREE:
        raise HTTPException(status_code=400, detail="Only free tier keys can be upgraded")

    try:
        checkout_session = stripe.checkout.Session.create(
            payment_method_types=["card"],
            line_items=[
                {
                    "price": os.getenv("STRIPE_PRO_PRICE_ID"),  # e.g., "price_123"
                    "quantity": 1,
                }
            ],
            mode="subscription",
            success_url=req.success_url,
            cancel_url=req.cancel_url,
            metadata={"api_key": req.api_key},
            client_reference_id=req.api_key,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return {"sessionId": checkout_session.id, "url": checkout_session.url}
app/api/routes_risk.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException
2
+ from app.models.risk_models import RiskResponse
3
+ from app.services.risk_service import get_system_risk
4
+
5
+ router = APIRouter()
6
+
7
+
8
+ @router.get("/get_risk", response_model=RiskResponse)
9
+ async def get_risk():
10
+ try:
11
+ risk = get_system_risk()
12
+ if risk < 0.3:
13
+ status = "low"
14
+ elif risk < 0.6:
15
+ status = "moderate"
16
+ elif risk < 0.8:
17
+ status = "high"
18
+ else:
19
+ status = "critical"
20
+ return RiskResponse(system_risk=risk, status=status)
21
+ except Exception as e:
22
+ raise HTTPException(status_code=500, detail=str(e))
23
+
24
+
25
+ @router.get("/history")
26
+ async def get_risk_history():
27
+ """
28
+ Return dummy historical risk data for the last 24 hours.
29
+ Replace with real database query later.
30
+ """
31
+ import random
32
+ import datetime
33
+ now = datetime.datetime.now()
34
+ data = []
35
+ for i in range(24, 0, -1):
36
+ data.append({
37
+ "time": (now - datetime.timedelta(hours=i)).isoformat(),
38
+ "risk": round(random.uniform(0.2, 0.8), 2)
39
+ })
40
+ return data
app/api/routes_users.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ User endpoints – registration and quota information.
3
+ """
4
+
5
+ import uuid
6
+ from fastapi import APIRouter, Depends, HTTPException, Request
7
+ from slowapi import Limiter
8
+ from slowapi.util import get_remote_address
9
+ from app.core.usage_tracker import tracker, enforce_quota, Tier
10
+
11
+ router = APIRouter(prefix="/users", tags=["users"])
12
+
13
+ # Rate limiter for registration (5 per hour per IP)
14
+ limiter = Limiter(key_func=get_remote_address, default_limits=["5/hour"])
15
+
16
+
17
@router.post("/register")
@limiter.limit("5/hour")
async def register_user(request: Request):
    """Create and return a brand-new free-tier API key.

    Public endpoint, rate-limited to 5 requests per hour per client IP.
    """
    if tracker is None:
        raise HTTPException(status_code=503, detail="Usage tracking not available")

    # Mint a fresh key and persist it at the FREE tier.
    fresh_key = f"sk_free_{uuid.uuid4().hex[:24]}"
    if not tracker.get_or_create_api_key(fresh_key, Tier.FREE):
        raise HTTPException(status_code=500, detail="Failed to create API key")

    return {
        "api_key": fresh_key,
        "tier": "free",
        "message": "API key created. Store it securely – you won't see it again."
    }
40
+
41
+
42
@router.get("/quota")
async def get_user_quota(request: Request, quota: dict = Depends(enforce_quota)):
    """Return the caller's tier and remaining monthly evaluation quota.

    Requires the API key in the Authorization header (enforced by
    ``enforce_quota``).
    """
    tier = quota["tier"]
    # The original guarded ``limit`` against tier being None but then
    # dereferenced ``tier.value`` unconditionally; guard both consistently.
    return {
        "tier": tier.value if tier else None,
        "remaining": quota["remaining"],
        "limit": tier.monthly_evaluation_limit if tier else None,
    }
app/api/webhooks.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stripe webhook handler – updates API key tier on subscription events.
3
+ """
4
+
5
+ import os
6
+ import stripe
7
+ from fastapi import APIRouter, Request, HTTPException
8
+ from app.core.usage_tracker import update_key_tier, Tier
9
+
10
+ router = APIRouter(prefix="/webhooks", tags=["webhooks"])
11
+
12
+ STRIPE_WEBHOOK_SECRET = os.getenv("STRIPE_WEBHOOK_SECRET")
13
+ stripe.api_key = os.getenv("STRIPE_SECRET_KEY")
14
+
15
+
16
+ @router.post("/stripe")
17
+ async def stripe_webhook(request: Request):
18
+ payload = await request.body()
19
+ sig_header = request.headers.get("stripe-signature")
20
+
21
+ if not STRIPE_WEBHOOK_SECRET or not stripe.api_key:
22
+ raise HTTPException(status_code=500, detail="Stripe not configured")
23
+
24
+ try:
25
+ event = stripe.Webhook.construct_event(
26
+ payload, sig_header, STRIPE_WEBHOOK_SECRET
27
+ )
28
+ except ValueError:
29
+ raise HTTPException(status_code=400, detail="Invalid payload")
30
+ except stripe.error.SignatureVerificationError:
31
+ raise HTTPException(status_code=400, detail="Invalid signature")
32
+
33
+ # Handle subscription events
34
+ if event["type"] == "checkout.session.completed":
35
+ session = event["data"]["object"]
36
+ api_key = session.get("client_reference_id") or session.get("metadata", {}).get("api_key")
37
+ if api_key:
38
+ update_key_tier(api_key, Tier.PRO)
39
+ elif event["type"] == "customer.subscription.deleted":
40
+ subscription = event["data"]["object"]
41
+ # You need to store a mapping from subscription ID to API key.
42
+ # For simplicity, we assume you stored it in metadata during checkout.
43
+ # Alternatively, look up by customer ID.
44
+ api_key = subscription.get("metadata", {}).get("api_key")
45
+ if api_key:
46
+ update_key_tier(api_key, Tier.FREE)
47
+
48
+ return {"status": "ok"}
app/causal_explainer.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Causal reasoning module for ARF – OSS Edition.
3
+ Provides counterfactual explanations using deterministic heuristics.
4
+ No external causal libraries required.
5
+ """
6
+ from typing import Dict, Any, Optional, List, Tuple
7
+ from dataclasses import dataclass, field
8
+ import pandas as pd
9
+
10
+
11
@dataclass
class CausalExplanation:
    """Result of one counterfactual query.

    Holds the observed (factual) metric value, the estimated value under
    the alternative action, and their difference, plus an optional
    confidence interval, human-readable text, and caveats.
    """

    factual_outcome: float
    counterfactual_outcome: float
    effect: float
    is_model_based: bool
    confidence_interval: Optional[Tuple[float, float]] = None
    explanation_text: str = ""
    warnings: List[str] = field(default_factory=list)
20
+
21
+
22
class CausalExplainer:
    """
    Heuristic causal explainer for healing actions.

    Uses fixed domain rules (a fractional impact per action type) plus a
    constant uncertainty band to produce counterfactual estimates; no
    fitted structural causal model and no external causal libraries.
    """

    def __init__(self, memory_store=None):
        self.memory = memory_store
        self.treatment = "healing_action"  # symbolic name
        self.outcome = "latency"
        # Fractional effect of each action on the tracked metrics
        # (negative = improvement).
        self._action_impact = {
            "restart_container": {"latency_effect": -0.15, "error_rate_effect": -0.10},
            "scale_out": {"latency_effect": -0.20, "error_rate_effect": -0.05},
            "rollback": {"latency_effect": -0.25, "error_rate_effect": -0.20},
            "circuit_breaker": {"latency_effect": -0.05, "error_rate_effect": -0.30},
            "traffic_shift": {"latency_effect": -0.10, "error_rate_effect": -0.10},
            "alert_team": {"latency_effect": 0.0, "error_rate_effect": 0.0},
            "no_action": {"latency_effect": 0.0, "error_rate_effect": 0.0},
        }
        self._uncertainty = 0.1  # ±10% confidence interval

    def _extract_action_intensity(self, action_dict: Dict[str, Any]) -> float:
        """Map an action dict to a scalar "how invasive is this" score in [0, 1]."""
        action_type = action_dict.get("action_type", "no_action")
        if action_type == "no_action":
            return 0.0
        intensity_map = {
            "restart_container": 0.4,
            "scale_out": 0.6,
            "rollback": 0.8,
            "circuit_breaker": 0.7,
            "traffic_shift": 0.5,
            "alert_team": 0.1,
        }
        return intensity_map.get(action_type, 0.0)

    def _get_effect_for_action(self, action_dict: Dict[str, Any], metric: str) -> float:
        """Fractional effect of the action on ``metric`` ("latency" or "error_rate")."""
        action_type = action_dict.get("action_type", "no_action")
        impacts = self._action_impact.get(action_type, self._action_impact["no_action"])
        if metric == "latency":
            return impacts["latency_effect"]
        elif metric == "error_rate":
            return impacts["error_rate_effect"]
        return 0.0

    def counterfactual_explanation(
        self,
        observed_context: Dict[str, Any],
        alternative_action: Dict[str, Any],
        outcome_name: str = "latency",
        confidence_level: float = 0.95
    ) -> CausalExplanation:
        """Estimate ``outcome_name`` under ``alternative_action`` vs. what was observed.

        ``confidence_level`` is accepted for interface compatibility, but the
        interval width is driven solely by the fixed ``_uncertainty`` fraction.
        """
        factual_outcome = observed_context.get(outcome_name, 0.0)
        factual_action = observed_context.get("action_taken", {})
        factual_intensity = self._extract_action_intensity(factual_action)
        alt_intensity = self._extract_action_intensity(alternative_action)

        effect_frac = self._get_effect_for_action(alternative_action, outcome_name)
        # "Undo" case: comparing an active factual action against no_action
        # means the counterfactual reverses the factual action's effect.
        if alt_intensity == 0.0 and factual_intensity > 0.0:
            effect_frac = -self._get_effect_for_action(factual_action, outcome_name)

        counterfactual = max(0.0, factual_outcome * (1.0 + effect_frac))
        effect = counterfactual - factual_outcome
        ci_half = abs(effect) * self._uncertainty
        confidence_interval = (counterfactual - ci_half, counterfactual + ci_half)

        # The original auto-formatted f-string split replacement fields across
        # lines inside single-quoted literals (a syntax error before Python
        # 3.12); rebuilt here with the same rendered wording.
        alt_name = alternative_action.get("action_type", "unknown")
        factual_name = factual_action.get("action_type", "no action")
        explanation_text = (
            f"If we apply {alt_name} instead of {factual_name}, "
            f"{outcome_name} would change from {factual_outcome:.2f} to "
            f"{counterfactual:.2f} (Δ = {effect:.2f}). "
            f"Based on heuristic causal model."
        )

        return CausalExplanation(
            factual_outcome=factual_outcome,
            counterfactual_outcome=counterfactual,
            effect=effect,
            is_model_based=False,
            confidence_interval=confidence_interval,
            explanation_text=explanation_text,
            warnings=["Using heuristic causal model (no fitted SCM)."]
        )

    def explain_healing_intent(
        self,
        proposed_action: Dict[str, Any],
        current_state: Dict[str, Any],
        outcome_metric: str = "latency"
    ) -> CausalExplanation:
        """Explain ``proposed_action`` against ``current_state``.

        Builds an observed-context dict (``**current_state`` last, so its own
        keys win, matching the original merge order) and delegates to
        ``counterfactual_explanation``.
        """
        observed = {
            outcome_metric: current_state.get(outcome_metric, 0.0),
            "action_taken": current_state.get("last_action", {"action_type": "no_action"}),
            **current_state,
        }
        return self.counterfactual_explanation(
            observed_context=observed,
            alternative_action=proposed_action,
            outcome_name=outcome_metric
        )

    def discover_graph_from_memory(self, data: pd.DataFrame, method: str = "pc") -> Dict[str, Any]:
        """Stub graph discovery: returns the column set with no edges."""
        return {"nodes": list(data.columns), "edges": []}

    def fit_scm(
        self,
        data: pd.DataFrame,
        treatment: str,
        outcome: str,
        graph: Optional[Dict] = None
    ):
        """Stub SCM fit: records the treatment/outcome names only."""
        self.treatment = treatment
        self.outcome = outcome

    def estimate_effect(self, method_name: str = "backdoor.linear_regression") -> Optional[float]:
        """Stub effect estimate: no fitted model, so always None."""
        return None
app/core/__init__.py ADDED
File without changes
app/core/config.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pydantic_settings import BaseSettings
from typing import Optional


class Settings(BaseSettings):
    """Central application configuration, sourced from the environment / .env.

    pydantic-settings matches field names against environment variables
    (case-insensitively by default); variables it does not know about are
    ignored (extra = "ignore").
    """

    app_name: str = "ARF API Control Plane"
    environment: str = "development"  # deployment environment label
    database_url: Optional[str] = None  # SQLAlchemy URL consumed by app.database.session
    api_key: Optional[str] = None  # optional static API key
    RATE_LIMIT: str = "100/minute"  # default limit (slowapi rate string)

    # Usage tracker settings
    ARF_USAGE_TRACKING: bool = False  # master switch for quota/audit tracking
    ARF_USAGE_DB_PATH: str = "arf_usage.db"  # SQLite file used by UsageTracker
    ARF_REDIS_URL: Optional[str] = None  # optional Redis backend for counters
    ARF_API_KEYS: str = "{}"  # JSON string of {key: tier}

    class Config:
        env_file = ".env"
        extra = "ignore"


# Singleton settings instance, imported throughout the app.
settings = Settings()
app/core/storage.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# Simple in-memory list for incident history
# NOTE(review): module-level mutable state — per-process only and lost on
# restart; fine for demos, should move to persistent storage for production.
incident_history: list = []
app/core/usage_tracker.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage Tracker for ARF API – quotas, tiers, and audit logging.
3
+ Non‑invasive, configurable, thread‑safe, and background‑task ready.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import sqlite3
9
+ import threading
10
+ import time
11
+ from contextlib import contextmanager
12
+ from datetime import datetime, timedelta
13
+ from typing import Dict, Any, Optional, List
14
+ from enum import Enum
15
+ from dataclasses import dataclass
16
+ from fastapi import BackgroundTasks
17
+
18
+ # Optional Redis support
19
+ try:
20
+ import redis
21
+ REDIS_AVAILABLE = True
22
+ except ImportError:
23
+ REDIS_AVAILABLE = False
24
+ redis = None
25
+
26
+
27
class Tier(str, Enum):
    """Subscription tiers and their quota / retention policies."""

    FREE = "free"
    PRO = "pro"
    PREMIUM = "premium"
    ENTERPRISE = "enterprise"

    @property
    def monthly_evaluation_limit(self) -> Optional[int]:
        """Maximum evaluations per calendar month; None means unlimited."""
        if self is Tier.ENTERPRISE:
            return None  # unlimited
        per_tier = {
            Tier.FREE: 1000,
            Tier.PRO: 10_000,
            Tier.PREMIUM: 50_000,
        }
        return per_tier[self]

    @property
    def audit_log_retention_days(self) -> int:
        """Number of days audit-log rows for this tier are retained."""
        return {
            Tier.FREE: 7,
            Tier.PRO: 30,
            Tier.PREMIUM: 90,
            Tier.ENTERPRISE: 365,
        }[self]
52
+
53
+
54
@dataclass
class UsageRecord:
    """Single evaluation usage record (one row of the usage_log table)."""
    # Caller identity and its billing tier at request time.
    api_key: str
    tier: Tier
    # Unix epoch seconds when the request was handled.
    timestamp: float
    endpoint: str
    # Optional audit payloads; JSON-serialized on persistence.
    request_body: Optional[Dict[str, Any]] = None
    response: Optional[Dict[str, Any]] = None
    # Error string when the request failed, plus handler latency in ms.
    error: Optional[str] = None
    processing_ms: Optional[float] = None
65
+
66
+
67
class UsageTracker:
    """
    Thread‑safe usage tracker with SQLite storage and optional Redis for counters.

    SQLite holds API keys, the audit log, and (when Redis is not configured)
    the monthly quota counters. Redis, when available, only replaces the
    monthly counters.
    """

    def __init__(self, db_path: str = "arf_usage.db", redis_url: Optional[str] = None):
        """Open/create the SQLite store; connect to Redis when a URL is given."""
        self.db_path = db_path
        # One SQLite connection per thread, created lazily in _get_conn().
        self._local = threading.local()
        self._init_db()

        self._redis_client = None
        if redis_url and REDIS_AVAILABLE:
            self._redis_client = redis.from_url(redis_url)
        elif redis_url:
            # A Redis URL was configured but the client library is missing:
            # fail loudly instead of silently degrading to SQLite counters.
            raise ImportError("Redis client not installed. Run: pip install redis")

    @contextmanager
    def _get_conn(self):
        """Get a thread‑local SQLite connection."""
        # Connections are cached per thread and intentionally never closed
        # here; they live for the lifetime of the thread.
        if not hasattr(self._local, "conn"):
            self._local.conn = sqlite3.connect(self.db_path, check_same_thread=False)
            self._local.conn.row_factory = sqlite3.Row
        yield self._local.conn

    def _init_db(self) -> None:
        """Create tables and indexes if they do not already exist (idempotent)."""
        with self._get_conn() as conn:
            # Registered API keys and their current tier.
            conn.execute("""
                CREATE TABLE IF NOT EXISTS api_keys (
                    key TEXT PRIMARY KEY,
                    tier TEXT NOT NULL,
                    created_at REAL NOT NULL,
                    last_used_at REAL,
                    is_active INTEGER DEFAULT 1
                )
            """)
            # Per-request audit log; request/response stored as JSON text.
            conn.execute("""
                CREATE TABLE IF NOT EXISTS usage_log (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    api_key TEXT NOT NULL,
                    tier TEXT NOT NULL,
                    timestamp REAL NOT NULL,
                    endpoint TEXT NOT NULL,
                    request_body TEXT,
                    response TEXT,
                    error TEXT,
                    processing_ms REAL
                )
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_api_key_timestamp
                ON usage_log (api_key, timestamp)
            """)
            # Monthly quota counters used when Redis is not configured.
            conn.execute("""
                CREATE TABLE IF NOT EXISTS monthly_counts (
                    api_key TEXT NOT NULL,
                    year_month TEXT NOT NULL,
                    count INTEGER DEFAULT 0,
                    PRIMARY KEY (api_key, year_month)
                )
            """)
            conn.commit()

    def _get_month_key(self) -> str:
        """Return the current month bucket, e.g. '2025-06' (local time)."""
        return datetime.now().strftime("%Y-%m")

    def get_or_create_api_key(self, key: str, tier: Tier = Tier.FREE) -> bool:
        """Register a new API key. Returns True if key exists or was created."""
        with self._get_conn() as conn:
            row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (key,)).fetchone()
            if row:
                # Existing key keeps its current tier; no update performed here.
                return True
            conn.execute(
                "INSERT INTO api_keys (key, tier, created_at, is_active) VALUES (?, ?, ?, ?)",
                (key, tier.value, time.time(), 1)
            )
            conn.commit()
            return True

    def get_tier(self, api_key: str) -> Optional[Tier]:
        """Return the tier for a given API key, or None if key invalid/inactive."""
        with self._get_conn() as conn:
            row = conn.execute(
                "SELECT tier FROM api_keys WHERE key = ? AND is_active = 1",
                (api_key,)
            ).fetchone()
            if not row:
                return None
            return Tier(row["tier"])

    def update_api_key_tier(self, api_key: str, new_tier: Tier) -> bool:
        """Update the tier of an existing API key. Returns True if successful."""
        with self._get_conn() as conn:
            row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
            if not row:
                return False
            conn.execute("UPDATE api_keys SET tier = ? WHERE key = ?", (new_tier.value, api_key))
            conn.commit()
            return True

    def get_remaining_quota(self, api_key: str, tier: Tier) -> Optional[int]:
        """Return remaining evaluations for the month, or None if unlimited."""
        limit = tier.monthly_evaluation_limit
        if limit is None:
            return None

        month = self._get_month_key()
        # Prefer the Redis counter when configured (shared across processes).
        if self._redis_client:
            redis_key = f"arf:quota:{api_key}:{month}"
            count = int(self._redis_client.get(redis_key) or 0)
            return max(0, limit - count)

        # SQLite fallback: counter row may not exist yet for this month.
        with self._get_conn() as conn:
            row = conn.execute(
                "SELECT count FROM monthly_counts WHERE api_key = ? AND year_month = ?",
                (api_key, month)
            ).fetchone()
            count = row["count"] if row else 0
            return max(0, limit - count)

    def _increment_quota(self, api_key: str, tier: Tier) -> None:
        """Increment the monthly counter (internal, synchronous)."""
        limit = tier.monthly_evaluation_limit
        if limit is None:
            # Unlimited tiers are not counted at all.
            return
        month = self._get_month_key()
        if self._redis_client:
            redis_key = f"arf:quota:{api_key}:{month}"
            self._redis_client.incr(redis_key)
            # Counter expires shortly after the month rolls over.
            self._redis_client.expire(redis_key, timedelta(days=31))
        else:
            with self._get_conn() as conn:
                # Atomic upsert: insert the first hit or bump the counter.
                conn.execute(
                    """INSERT INTO monthly_counts (api_key, year_month, count)
                       VALUES (?, ?, 1)
                       ON CONFLICT(api_key, year_month) DO UPDATE SET count = count + 1""",
                    (api_key, month)
                )
                conn.commit()

    def _insert_audit_log(self, record: UsageRecord) -> None:
        """Insert a single audit log (internal, synchronous)."""
        with self._get_conn() as conn:
            conn.execute(
                """INSERT INTO usage_log
                   (api_key, tier, timestamp, endpoint, request_body, response, error, processing_ms)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    record.api_key,
                    record.tier.value,
                    record.timestamp,
                    record.endpoint,
                    json.dumps(record.request_body) if record.request_body else None,
                    json.dumps(record.response) if record.response else None,
                    record.error,
                    record.processing_ms,
                )
            )
            conn.commit()

    def increment_usage_sync(self, record: UsageRecord) -> bool:
        """
        Synchronously record usage and increment counter.
        Returns True if within quota (i.e., counter was incremented), False if quota exceeded.

        NOTE(review): check-then-increment is not atomic, so concurrent
        requests can slightly overshoot the quota under load.
        """
        tier = record.tier
        limit = tier.monthly_evaluation_limit
        if limit is not None:
            remaining = self.get_remaining_quota(record.api_key, tier)
            if remaining <= 0:
                return False
        self._increment_quota(record.api_key, tier)
        self._insert_audit_log(record)
        return True

    async def increment_usage_async(self, record: UsageRecord, background_tasks: BackgroundTasks) -> bool:
        """
        Asynchronously record usage using FastAPI BackgroundTasks.
        Returns True if quota allows (i.e., will be recorded), False if quota exceeded.

        The quota check happens now; the counter increment and audit write
        are deferred until after the response is sent.
        """
        tier = record.tier
        limit = tier.monthly_evaluation_limit
        if limit is not None:
            remaining = self.get_remaining_quota(record.api_key, tier)
            if remaining <= 0:
                return False
        # Schedule the actual write in the background
        background_tasks.add_task(self._increment_quota, record.api_key, tier)
        background_tasks.add_task(self._insert_audit_log, record)
        return True

    def get_audit_logs(
        self,
        api_key: str,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        limit: int = 100,
    ) -> List[Dict[str, Any]]:
        """Retrieve audit logs for a given API key.

        Optional date bounds are inclusive; results are newest-first and
        capped at `limit` rows.
        """
        query = "SELECT * FROM usage_log WHERE api_key = ?"
        params = [api_key]
        if start_date:
            query += " AND timestamp >= ?"
            params.append(start_date.timestamp())
        if end_date:
            query += " AND timestamp <= ?"
            params.append(end_date.timestamp())
        query += " ORDER BY timestamp DESC LIMIT ?"
        params.append(limit)

        with self._get_conn() as conn:
            rows = conn.execute(query, params).fetchall()
            return [dict(row) for row in rows]

    def clean_old_logs(self):
        """Delete logs older than retention period for each tier."""
        with self._get_conn() as conn:
            for tier in Tier:
                retention_days = tier.audit_log_retention_days
                if retention_days is None:
                    # Defensive: current tiers always return an int.
                    continue
                cutoff = time.time() - retention_days * 86400
                conn.execute(
                    "DELETE FROM usage_log WHERE tier = ? AND timestamp < ?",
                    (tier.value, cutoff)
                )
            conn.commit()
293
+
294
+
295
# Global instance
# Populated by init_tracker(); stays None while usage tracking is disabled.
tracker: Optional[UsageTracker] = None


def init_tracker(db_path: str = "arf_usage.db", redis_url: Optional[str] = None):
    """Create the module-global UsageTracker.

    NOTE(review): code that did `from app.core.usage_tracker import tracker`
    before this runs keeps its own binding to None — always access the
    tracker through this module (usage_tracker.tracker) after calling this.
    """
    global tracker
    tracker = UsageTracker(db_path, redis_url)
302
+
303
+
304
+ def update_key_tier(api_key: str, new_tier: Tier) -> bool:
305
+ """Globally accessible helper to update API key tier."""
306
+ if tracker is None:
307
+ return False
308
+ return tracker.update_api_key_tier(api_key, new_tier)
309
+
310
+
311
# FastAPI dependency to enforce quota
from fastapi import HTTPException, Request

async def enforce_quota(request: Request, api_key: Optional[str] = None):
    """
    Dependency that checks API key and remaining quota.
    Use in your endpoint: `quota = Depends(enforce_quota)`

    If usage tracking is disabled, returns a default dict (no enforcement).

    Raises:
        HTTPException 401: no API key supplied.
        HTTPException 403: key unknown or inactive.
        HTTPException 429: monthly evaluation quota exhausted.
    """
    # If tracker not initialised, allow all requests (fallback)
    if tracker is None:
        return {"api_key": api_key or "disabled", "tier": Tier.FREE, "remaining": None}

    # Extract API key from "Authorization: Bearer <key>" header, falling
    # back to the ?api_key= query parameter.
    if api_key is None:
        auth_header = request.headers.get("Authorization")
        if auth_header and auth_header.startswith("Bearer "):
            api_key = auth_header[7:]  # strip the "Bearer " prefix
        else:
            api_key = request.query_params.get("api_key")

    if not api_key:
        raise HTTPException(status_code=401, detail="Missing API key")

    tier = tracker.get_tier(api_key)
    if tier is None:
        raise HTTPException(status_code=403, detail="Invalid or inactive API key")

    # remaining is None for unlimited tiers — never rate-limited.
    remaining = tracker.get_remaining_quota(api_key, tier)
    if remaining is not None and remaining <= 0:
        raise HTTPException(status_code=429, detail="Monthly evaluation quota exceeded")

    # Store in request state for later logging
    request.state.api_key = api_key
    request.state.tier = tier
    return {"api_key": api_key, "tier": tier, "remaining": remaining}
app/database/__init__.py ADDED
File without changes
app/database/base.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
# declarative_base() moved to sqlalchemy.orm in SQLAlchemy 1.4; importing
# it from sqlalchemy.ext.declarative is deprecated (removed-in-2.x warning).
try:
    from sqlalchemy.orm import declarative_base
except ImportError:  # pragma: no cover - SQLAlchemy < 1.4 fallback
    from sqlalchemy.ext.declarative import declarative_base

# Shared declarative base for all ORM models in app.database.
Base = declarative_base()
app/database/models_intents.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import Column, Integer, String, DateTime, Boolean, Text, JSON, ForeignKey, UniqueConstraint
2
+ from sqlalchemy.orm import relationship
3
+ import datetime
4
+ from .base import Base
5
+
6
+
7
class IntentDB(Base):
    """Persisted infrastructure intent plus its evaluation metadata."""
    __tablename__ = "intents"
    id = Column(Integer, primary_key=True, index=True)
    # Stable content hash used for idempotent upserts (see intent_store).
    deterministic_id = Column(String(64), unique=True, index=True, nullable=False)
    intent_type = Column(String(64), nullable=False)
    payload = Column(JSON, nullable=False)  # raw API request payload
    oss_payload = Column(JSON, nullable=True)  # serialized OSS intent, set once evaluated
    environment = Column(String(32), nullable=True)
    created_at = Column(DateTime, default=datetime.datetime.utcnow, nullable=False)
    evaluated_at = Column(DateTime, nullable=True)
    # NOTE(review): risk score is stored as text (str(float) in
    # intent_store.save_evaluated_intent); numeric queries need a cast.
    risk_score = Column(String(32), nullable=True)
    # Outcomes are deleted with their intent (cascade delete-orphan).
    outcomes = relationship("OutcomeDB", back_populates="intent", cascade="all, delete-orphan")
19
+
20
+
21
class OutcomeDB(Base):
    """Recorded real-world outcome (success/failure) of an evaluated intent."""
    __tablename__ = "intent_outcomes"
    id = Column(Integer, primary_key=True, index=True)
    intent_id = Column(Integer, ForeignKey("intents.id", ondelete="CASCADE"), nullable=False)
    success = Column(Boolean, nullable=False)
    recorded_by = Column(String(128), nullable=True)  # who reported the outcome
    notes = Column(Text, nullable=True)
    recorded_at = Column(DateTime, default=datetime.datetime.utcnow, nullable=False)
    intent = relationship("IntentDB", back_populates="outcomes")

    # At most one outcome per intent (enforced at the DB level; mirrored by
    # the application check in outcome_service.record_outcome).
    __table_args__ = (
        UniqueConstraint("intent_id", name="uq_outcome_intentid"),
    )
app/database/session.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from app.core.config import settings

# Fail fast with an actionable message: settings.database_url is Optional
# and defaults to None, and create_engine(None) raises an opaque error.
if not settings.database_url:
    raise RuntimeError(
        "database_url is not configured; set DATABASE_URL in the "
        "environment (or .env) before starting the application."
    )

# Engine and session factory shared by the whole app (see app.api.deps.get_db).
engine = create_engine(settings.database_url)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
app/main.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ARF API Control Plane - Main Application Entry Point
3
+ With optional heavy dependencies and usage tracking.
4
+ """
5
+ import logging
6
+ import os
7
+ import sys
8
+ import json
9
+ from contextlib import asynccontextmanager
10
+ from typing import Dict
11
+
12
+ from fastapi import FastAPI
13
+ from fastapi.middleware.cors import CORSMiddleware
14
+
15
+ # Optional prometheus
16
+ try:
17
+ from prometheus_fastapi_instrumentator import Instrumentator
18
+ PROMETHEUS_AVAILABLE = True
19
+ except ImportError:
20
+ PROMETHEUS_AVAILABLE = False
21
+ Instrumentator = None
22
+
23
+ # Optional slowapi
24
+ try:
25
+ from slowapi import _rate_limit_exceeded_handler
26
+ from slowapi.errors import RateLimitExceeded
27
+ from slowapi.middleware import SlowAPIMiddleware
28
+ SLOWAPI_AVAILABLE = True
29
+ except ImportError:
30
+ SLOWAPI_AVAILABLE = False
31
+ _rate_limit_exceeded_handler = None
32
+ RateLimitExceeded = None
33
+ SlowAPIMiddleware = None
34
+
35
+ # Optional agentic_reliability_framework (risk engine, policy engine, etc.)
36
+ try:
37
+ from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
38
+ from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
39
+ from agentic_reliability_framework.runtime.memory import create_faiss_index, RAGGraphMemory
40
+ from agentic_reliability_framework.runtime.memory.constants import MemoryConstants
41
+ ARF_AVAILABLE = True
42
+ except ImportError:
43
+ ARF_AVAILABLE = False
44
+ RiskEngine = None
45
+ PolicyEngine = None
46
+ create_faiss_index = None
47
+ RAGGraphMemory = None
48
+ MemoryConstants = None
49
+
50
+ # ===== USAGE TRACKER =====
51
+ from app.core.usage_tracker import init_tracker, tracker, Tier
52
+
53
+ from app.api import (
54
+ routes_governance,
55
+ routes_history,
56
+ routes_incidents,
57
+ routes_intents,
58
+ routes_risk,
59
+ routes_memory,
60
+ routes_admin,
61
+ routes_payments,
62
+ webhooks,
63
+ routes_users, # <-- ADDED
64
+ )
65
+ from app.api.deps import limiter
66
+ from app.core.config import settings
67
+
68
+ logger = logging.getLogger("arf.api")
69
+ logging.basicConfig(
70
+ level=logging.INFO,
71
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
72
+ handlers=[logging.StreamHandler(sys.stdout)],
73
+ )
74
+
75
+
76
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application startup/shutdown hook.

    On startup: initialises the ARF engines (risk, policy, RAG memory), the
    optional epistemic embedding model, and the usage tracker, all attached
    to app.state. On shutdown: logs only (no resources to release).
    """
    logger.info("🚀 Starting ARF API Control Plane")
    logger.debug(f"Python path: {sys.path}")

    if ARF_AVAILABLE:
        hmc_model_path = os.getenv("ARF_HMC_MODEL", "models/hmc_model.json")
        use_hyperpriors = os.getenv(
            "ARF_USE_HYPERPRIORS",
            "false").lower() == "true"
        logger.info(
            "Initializing RiskEngine – HMC model: %s, hyperpriors: %s",
            hmc_model_path,
            use_hyperpriors)
        # RiskEngine is mandatory when ARF is installed: a failure aborts startup.
        try:
            app.state.risk_engine = RiskEngine(
                hmc_model_path=hmc_model_path,
                use_hyperpriors=use_hyperpriors,
                n0=1000,
                hyperprior_weight=0.3,
            )
            logger.info("✅ RiskEngine initialized successfully.")
        except Exception as e:
            logger.exception("💥 Fatal error initializing RiskEngine")
            raise RuntimeError("RiskEngine initialization failed") from e

        # PolicyEngine and RAG memory are best-effort: the API degrades
        # gracefully (state set to None) if they fail to come up.
        try:
            app.state.policy_engine = PolicyEngine()
            logger.info("✅ PolicyEngine initialized successfully.")
        except Exception as e:
            logger.warning(f"PolicyEngine initialization failed: {e}")
            app.state.policy_engine = None

        try:
            faiss_index = create_faiss_index(dim=MemoryConstants.VECTOR_DIM)
            app.state.rag_graph = RAGGraphMemory(faiss_index)
            logger.info("✅ RAGGraphMemory initialized successfully.")
        except Exception as e:
            logger.warning(f"RAGGraphMemory initialization failed: {e}")
            app.state.rag_graph = None

        # Optional sentence-transformers model for epistemic signals.
        epistemic_model_name = os.getenv("EPISTEMIC_MODEL", "")
        if epistemic_model_name:
            try:
                from sentence_transformers import SentenceTransformer
                logger.info(f"Loading epistemic model: {epistemic_model_name}")
                app.state.epistemic_model = SentenceTransformer(
                    epistemic_model_name)
                app.state.epistemic_tokenizer = app.state.epistemic_model.tokenizer
                logger.info("✅ Epistemic model loaded.")
            except ImportError:
                logger.warning(
                    "sentence-transformers not installed; epistemic signals will be zeros.")
                app.state.epistemic_model = None
                app.state.epistemic_tokenizer = None
            except Exception as e:
                logger.warning(f"Failed to load epistemic model: {e}")
                app.state.epistemic_model = None
                app.state.epistemic_tokenizer = None
        else:
            logger.info(
                "EPISTEMIC_MODEL not set; epistemic signals will be zeros.")
            app.state.epistemic_model = None
            app.state.epistemic_tokenizer = None
    else:
        logger.warning(
            "agentic_reliability_framework not installed; risk engine, policy engine, RAG disabled.")

    # ===== USAGE TRACKER INITIALISATION =====
    if os.getenv("ARF_USAGE_TRACKING", "false").lower() == "true":
        logger.info("Initialising usage tracker...")
        init_tracker(
            db_path=os.getenv("ARF_USAGE_DB_PATH", "arf_usage.db"),
            redis_url=os.getenv("ARF_REDIS_URL")
        )
        # BUG FIX: `from app.core.usage_tracker import tracker` binds the
        # value (None) at import time, so the module-level name never sees
        # the instance created by init_tracker(). Re-read the attribute
        # from the module instead of using the stale import.
        from app.core import usage_tracker as usage_tracker_module
        active_tracker = usage_tracker_module.tracker
        # Seed initial API keys from environment variable (for testing / demo)
        api_keys_json = os.getenv("ARF_API_KEYS", "{}")
        try:
            api_keys = json.loads(api_keys_json)
            for key, tier_str in api_keys.items():
                try:
                    tier = Tier(tier_str.lower())
                    active_tracker.get_or_create_api_key(key, tier)
                    logger.info(f"Seeded API key for tier {tier.value}")
                except ValueError:
                    logger.warning(f"Invalid tier '{tier_str}' for key {key}, skipping")
        except json.JSONDecodeError:
            logger.warning("ARF_API_KEYS environment variable is not valid JSON; skipping seeding.")
        app.state.usage_tracker = active_tracker
        logger.info("✅ Usage tracker ready.")
    else:
        logger.info("Usage tracking disabled (ARF_USAGE_TRACKING not set to true).")
        app.state.usage_tracker = None

    yield
    logger.info("🛑 Shutting down ARF API")
172
+
173
+
174
def create_app() -> FastAPI:
    """Build and configure the FastAPI application.

    Wires CORS, optional rate limiting (slowapi) and Prometheus metrics,
    then mounts all API routers. Engine/tracker initialisation happens in
    `lifespan`, not here.
    """
    app = FastAPI(
        title=settings.app_name,
        version="0.5.0",
        lifespan=lifespan,
        docs_url="/docs",
        redoc_url="/redoc",
        description="Agentic Reliability Framework (ARF) API",
    )

    # Only the known frontend origin may make credentialed browser requests.
    allowed_origins = ["https://arf-frontend-sandy.vercel.app"]
    app.add_middleware(
        CORSMiddleware,
        allow_origins=allowed_origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    logger.debug("CORS middleware configured")

    # Rate limiting is active only when slowapi is importable.
    if SLOWAPI_AVAILABLE:
        app.state.limiter = limiter
        app.add_exception_handler(
            RateLimitExceeded,
            _rate_limit_exceeded_handler)
        app.add_middleware(SlowAPIMiddleware)
        logger.debug("Rate limiter middleware configured")
    else:
        logger.debug("Rate limiter disabled (slowapi not installed)")

    # Metrics endpoint is exposed only when the instrumentator is installed.
    if PROMETHEUS_AVAILABLE:
        Instrumentator().instrument(app).expose(app)
        logger.debug("Prometheus instrumentator configured")
    else:
        logger.debug(
            "Prometheus instrumentator disabled (module not installed)")

    # Include routers
    # NOTE: most routers live under /api/v1; memory uses /v1/memory and
    # webhooks are mounted at the application root.
    app.include_router(
        routes_incidents.router,
        prefix="/api/v1",
        tags=["incidents"])
    app.include_router(routes_risk.router, prefix="/api/v1", tags=["risk"])
    app.include_router(
        routes_intents.router,
        prefix="/api/v1",
        tags=["intents"])
    app.include_router(
        routes_history.router,
        prefix="/api/v1",
        tags=["history"])
    app.include_router(
        routes_governance.router,
        prefix="/api/v1",
        tags=["governance"])
    app.include_router(
        routes_memory.router,
        prefix="/v1/memory",
        tags=["memory"])
    app.include_router(
        routes_admin.router,
        prefix="/api/v1",
        tags=["admin"])
    app.include_router(
        routes_payments.router,
        prefix="/api/v1",
        tags=["payments"])
    app.include_router(
        webhooks.router,
        tags=["webhooks"])
    app.include_router(
        routes_users.router,  # <-- ADDED
        prefix="/api/v1",
        tags=["users"])
    logger.debug("All API routers included")

    @app.get("/health", tags=["health"])
    async def health() -> Dict[str, str]:
        # Liveness probe for load balancers / orchestrators.
        return {"status": "ok"}

    return app
255
+
256
+
257
+ app = create_app()
app/services/__init__.py ADDED
File without changes
app/services/incident_service.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from agentic_reliability_framework.core.reliability_signal import signal_to_reliability
2
+ from app.models.incident_models import IncidentReport
3
+
4
+
5
def process_incident(report: IncidentReport) -> float:
    """Convert an incident report's raw signal value into a reliability score."""
    return signal_to_reliability(report.value, signal_type=report.signal_type)
app/services/intent_adapter.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from agentic_reliability_framework.core.governance.intents import (
2
+ ProvisionResourceIntent,
3
+ GrantAccessIntent,
4
+ DeployConfigurationIntent,
5
+ )
6
+
7
+
8
def to_oss_intent(api_request):
    """Map an API-layer intent request onto the matching OSS intent object.

    Dispatches on `api_request.intent_type`; raises ValueError for types
    this adapter does not understand.
    """
    kind = api_request.intent_type
    if kind == "provision_resource":
        return ProvisionResourceIntent(
            resource_type=api_request.resource_type,
            region=api_request.region,
            size=api_request.size,
            configuration=api_request.configuration,
            environment=api_request.environment,
            requester=api_request.requester,
            provenance=api_request.provenance,
        )
    if kind == "grant_access":
        return GrantAccessIntent(
            principal=api_request.principal,
            permission_level=api_request.permission_level,
            resource_scope=api_request.resource_scope,
            justification=api_request.justification,
            requester=api_request.requester,
            provenance=api_request.provenance,
        )
    if kind == "deploy_config":
        return DeployConfigurationIntent(
            service_name=api_request.service_name,
            change_scope=api_request.change_scope,
            deployment_target=api_request.deployment_target,
            risk_level_hint=api_request.risk_level_hint,
            configuration=api_request.configuration,
            requester=api_request.requester,
            provenance=api_request.provenance,
        )
    raise ValueError(f"Unknown intent type: {api_request.intent_type}")
app/services/intent_service.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import logging
3
+ from app.models.intent_models import IntentSimulation
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+
8
# Note: This endpoint is deprecated. Use /v1/intents/evaluate instead.
def simulate_intent(intent: IntentSimulation) -> dict:
    """Deprecated random-risk simulation, kept for backward compatibility.

    The `intent` argument is ignored; the score is drawn uniformly and the
    recommendation derives from fixed thresholds (0.2 / 0.6).
    """
    logger.warning("Deprecated endpoint /simulate_intent used. Please migrate to /v1/intents/evaluate.")
    # For backward compatibility, we still use random risk.
    risk_score = random.uniform(0, 1)
    if risk_score >= 0.6:
        recommendation = "blocked"
    elif risk_score >= 0.2:
        recommendation = "requires_approval"
    else:
        recommendation = "safe_to_execute"
    return {"risk_score": risk_score, "recommendation": recommendation}
app/services/intent_store.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ from sqlalchemy.orm import Session
3
+ from app.database.models_intents import IntentDB
4
+ from typing import Any, Dict, Optional
5
+
6
+
7
def save_evaluated_intent(
    db: Session,
    deterministic_id: str,
    intent_type: str,
    api_payload: Dict[str, Any],
    oss_payload: Dict[str, Any],
    environment: str,
    risk_score: float
) -> IntentDB:
    """Upsert an evaluated intent keyed by its deterministic id.

    Re-evaluating a known intent refreshes evaluated_at, risk_score and
    oss_payload in place; otherwise a new row is inserted. Commits and
    refreshes before returning the persisted row.
    """
    existing = db.query(IntentDB).filter(IntentDB.deterministic_id == deterministic_id).one_or_none()
    if existing:
        existing.evaluated_at = datetime.datetime.utcnow()
        # risk_score column is String(32): stored as the float's str().
        existing.risk_score = str(risk_score)
        existing.oss_payload = oss_payload
        db.add(existing)
        db.commit()
        db.refresh(existing)
        return existing

    intent = IntentDB(
        deterministic_id=deterministic_id,
        intent_type=intent_type,
        payload=api_payload,
        oss_payload=oss_payload,
        environment=environment,
        evaluated_at=datetime.datetime.utcnow(),
        risk_score=str(risk_score)
    )
    db.add(intent)
    db.commit()
    db.refresh(intent)
    return intent
39
+
40
+
41
def get_intent_by_deterministic_id(db: Session, deterministic_id: str) -> Optional[IntentDB]:
    """Look up an intent row by its deterministic id; None when absent."""
    matching = db.query(IntentDB).filter(IntentDB.deterministic_id == deterministic_id)
    return matching.one_or_none()
app/services/outcome_service.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import logging
3
+ from typing import Optional, Dict, Any
4
+
5
+ from sqlalchemy.orm import Session
6
+
7
+ from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
8
+ from agentic_reliability_framework.core.governance.intents import (
9
+ InfrastructureIntent,
10
+ ProvisionResourceIntent,
11
+ GrantAccessIntent,
12
+ DeployConfigurationIntent,
13
+ )
14
+ from app.database.models_intents import IntentDB, OutcomeDB
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class OutcomeConflictError(Exception):
    """Raised when an outcome is re-recorded with a contradictory result."""
    pass
21
+
22
+
23
def reconstruct_oss_intent_from_json(oss_json: Dict[str, Any]) -> InfrastructureIntent:
    """Rebuild a typed OSS intent from its stored JSON payload.

    The payload's "intent_type" discriminator selects the concrete class;
    an unknown or missing discriminator raises ValueError.
    """
    intent_type = oss_json.get("intent_type")
    if intent_type == "provision_resource":
        return ProvisionResourceIntent(**oss_json)
    if intent_type == "grant_access":
        return GrantAccessIntent(**oss_json)
    if intent_type == "deploy_config":
        return DeployConfigurationIntent(**oss_json)
    raise ValueError(
        f"Cannot reconstruct intent from JSON: missing or unknown intent_type {intent_type}"
    )
35
+
36
+
37
def _create_dummy_intent(intent_type: str) -> Optional[InfrastructureIntent]:
    """Create a valid dummy intent for a given intent type.

    Only "ProvisionResourceIntent" is supported so far; other types log a
    warning and yield None so callers can skip the risk-engine update.
    """
    from agentic_reliability_framework.core.governance.intents import (
        ProvisionResourceIntent,
    )
    if intent_type != "ProvisionResourceIntent":
        logger.warning("Dummy intent creation not implemented for %s", intent_type)
        return None
    # Plain string field values; they must satisfy the model's validators.
    return ProvisionResourceIntent(
        resource_type="vm",
        region="eastus",
        size="Standard_D2s_v3",
        environment="dev",  # string form instead of Environment.dev
        requester="system"
    )
56
+
57
+
58
def record_outcome(
    db: Session,
    deterministic_id: str,
    success: bool,
    recorded_by: Optional[str],
    notes: Optional[str],
    risk_engine: RiskEngine
) -> OutcomeDB:
    """Persist the real-world outcome of an evaluated intent and feed it
    back into the Bayesian risk engine.

    Idempotent: recording the same result again returns the existing row.

    Raises:
        ValueError: no intent exists for `deterministic_id`.
        OutcomeConflictError: an outcome with the opposite result exists.
    """
    intent = db.query(IntentDB).filter(IntentDB.deterministic_id == deterministic_id).one_or_none()
    if not intent:
        raise ValueError(f"Intent not found: {deterministic_id}")

    # Enforce one outcome per intent (mirrors the DB unique constraint).
    existing_outcome = db.query(OutcomeDB).filter(OutcomeDB.intent_id == intent.id).one_or_none()
    if existing_outcome:
        if existing_outcome.success == success:
            # Idempotent replay of the same result.
            return existing_outcome
        raise OutcomeConflictError("Outcome already recorded with different result")

    outcome = OutcomeDB(
        intent_id=intent.id,
        success=bool(success),
        recorded_by=recorded_by,
        notes=notes,
        recorded_at=datetime.datetime.utcnow()  # will be replaced with timezone-aware later
    )
    db.add(outcome)
    db.commit()
    db.refresh(outcome)

    # Determine OSS intent for risk engine update. A dummy intent is used
    # when the stored payload is absent or cannot be deserialized, so the
    # engine still receives the outcome where possible.
    oss_intent = None
    if intent.oss_payload:
        try:
            oss_intent = reconstruct_oss_intent_from_json(intent.oss_payload)
        except Exception as e:
            logger.warning(
                "Failed to reconstruct OSS intent for %s: %s. Using dummy fallback.",
                deterministic_id, e
            )
            oss_intent = _create_dummy_intent(intent.intent_type)
    else:
        oss_intent = _create_dummy_intent(intent.intent_type)

    # Update risk engine if we have an intent. The outcome row is already
    # committed above, so engine failures never roll back the record.
    if oss_intent is not None:
        try:
            risk_engine.update_outcome(oss_intent, success)
        except Exception as e:
            logger.exception(
                "Failed to update RiskEngine after recording outcome for intent %s: %s",
                deterministic_id, e
            )
    else:
        logger.error(
            "No valid OSS intent available for risk engine update; skipping outcome for %s",
            deterministic_id
        )

    return outcome
app/services/risk_service.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
2
+ from agentic_reliability_framework.core.governance.intents import InfrastructureIntent
3
+ from typing import Optional, List, Dict, Any
4
+
5
+ from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction
6
+ from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
7
+ from agentic_reliability_framework.core.decision.decision_engine import DecisionEngine
8
+ from agentic_reliability_framework.runtime.memory.rag_graph import RAGGraphMemory
9
+
10
+ # NEW: Import eclipse probe
11
+ from agentic_reliability_framework.core.research.eclipse_probe import compute_epistemic_risk
12
+
13
+
14
def evaluate_intent(
    engine: RiskEngine,
    intent: InfrastructureIntent,
    cost_estimate: Optional[float],
    policy_violations: List[str]
) -> dict:
    """Score an infrastructure intent with the Bayesian risk engine.

    Delegates to ``engine.calculate_risk`` and repackages its three-part
    result (score, explanation, per-factor contributions) into a response
    dictionary.

    Args:
        engine: Risk engine performing the calculation.
        intent: The infrastructure intent to evaluate.
        cost_estimate: Optional estimated cost of the intent.
        policy_violations: Names of policies the intent violates.

    Returns:
        Dict with keys ``risk_score``, ``explanation`` and ``contributions``.
    """
    risk_result = engine.calculate_risk(
        intent=intent,
        cost_estimate=cost_estimate,
        policy_violations=policy_violations,
    )
    response_keys = ("risk_score", "explanation", "contributions")
    return dict(zip(response_keys, risk_result))
34
+
35
+
36
def evaluate_healing_decision(
    event: ReliabilityEvent,
    policy_engine: PolicyEngine,
    decision_engine: Optional[DecisionEngine] = None,
    rag_graph: Optional[RAGGraphMemory] = None,
    model=None,  # optional HuggingFace-style model for the eclipse probe
    tokenizer=None,  # optional matching tokenizer
) -> Dict[str, Any]:
    """
    Evaluate healing actions for a given reliability event using decision‑theoretic selection.
    Now includes epistemic risk signals from the eclipse probe.

    Flow: gather raw candidate actions from the policy engine (with its own
    decision engine temporarily disabled), build reasoning/evidence text,
    compute epistemic signals (zeros when no model/tokenizer is supplied),
    then let the decision engine pick the optimal action.

    Returns:
        Dictionary with keys: risk_score, selected_action, expected_utility, alternatives
        (top 3), explanation, raw_decision, epistemic_signals.
    """
    # Resolve a decision engine: prefer the explicit argument, then one
    # attached to the policy engine, else build a minimal fallback.
    if decision_engine is None and hasattr(policy_engine, 'decision_engine'):
        decision_engine = policy_engine.decision_engine

    if decision_engine is None:
        decision_engine = DecisionEngine(rag_graph=rag_graph)

    # Get raw candidate actions by temporarily disabling the policy engine's
    # own decision logic; the flag is restored in `finally` even on error.
    orig_use = policy_engine.use_decision_engine
    try:
        policy_engine.use_decision_engine = False
        raw_actions = policy_engine.evaluate_policies(event)
    finally:
        policy_engine.use_decision_engine = orig_use

    # Short-circuit: nothing triggered (or only the explicit NO_ACTION).
    if not raw_actions or raw_actions == [HealingAction.NO_ACTION]:
        return {
            "risk_score": 0.0,
            "selected_action": HealingAction.NO_ACTION.value,
            "expected_utility": 0.0,
            "alternatives": [],
            "explanation": "No candidate actions triggered.",
            "epistemic_signals": None,
        }

    # Build reasoning text from every policy whose action set overlaps the
    # raw candidates (NOTE(review): this may include policies that did not
    # actually fire but share an action — confirm intended).
    reasoning_parts = []
    for policy in policy_engine.policies:
        # Check if any of the policy's actions are in raw_actions
        if any(a in policy.actions for a in raw_actions):
            conditions_str = ", ".join(
                f"{c.metric} {c.operator} {c.threshold}" for c in policy.conditions
            )
            reasoning_parts.append(
                f"Policy {policy.name} triggered by {conditions_str} → actions {[a.value for a in policy.actions]}"
            )
    reasoning_text = " ".join(reasoning_parts)

    # Evidence text summarizes the event's observed metrics.
    evidence_text = (
        f"Component: {event.component}, "
        f"latency_p99: {event.latency_p99}, "
        f"error_rate: {event.error_rate}, "
        f"cpu_util: {event.cpu_util}, "
        f"memory_util: {event.memory_util}"
    )

    # Compute epistemic signals via the eclipse probe when a model and
    # tokenizer are available; otherwise fall back to all-zero signals so
    # downstream consumers always see the same dict shape.
    epistemic_signals = None
    if model is not None and tokenizer is not None:
        epistemic_signals = compute_epistemic_risk(
            reasoning_text, evidence_text, model, tokenizer
        )
    else:
        # In OSS, we may not have a model; use zeros as fallback
        epistemic_signals = {
            "entropy": 0.0,
            "contradiction": 0.0,
            "evidence_lift": 0.0,
            "hallucination_risk": 0.0,
        }

    # Run decision engine to get best action and alternatives, passing epistemic signals
    decision = decision_engine.select_optimal_action(
        raw_actions, event, component=event.component,
        epistemic_signals=epistemic_signals
    )

    # Risk of the selected action: read it from the alternatives list if
    # present, otherwise compute it separately.
    risk_score = None
    for alt in decision.alternatives:
        if alt.action == decision.best_action:
            risk_score = alt.risk
            break
    if risk_score is None:
        # Compute risk separately
        risk_score = decision_engine.compute_risk(decision.best_action, event, event.component)

    # Format alternatives (top 3 only)
    alt_list = []
    for alt in decision.alternatives[:3]:
        alt_list.append({
            "action": alt.action.value,
            "expected_utility": alt.utility,
            "risk": alt.risk,
        })

    # Build final response
    response = {
        "risk_score": risk_score,
        "selected_action": decision.best_action.value,
        "expected_utility": decision.expected_utility,
        "alternatives": alt_list,
        "explanation": decision.explanation,
        "raw_decision": decision.raw_data,
        "epistemic_signals": epistemic_signals,
    }
    return response
153
+
154
+
155
def get_system_risk() -> float:
    """Return a placeholder system-wide risk score in [0, 1].

    Deprecated: this endpoint is being phased out and is kept only for
    backward compatibility. The value is random, not a real measurement.
    """
    import random

    score = random.uniform(0, 1)
    return round(score, 2)