petter2025 commited on
Commit
6d20eab
·
1 Parent(s): ec3f480

Upload folder using huggingface_hub (#3)

Browse files

- Upload folder using huggingface_hub (afa4de7853bd41745e14f4126ce2a1f5b8beccfa)

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +8 -0
  2. .gitignore +16 -33
  3. Dockerfile +2 -1
  4. README.md +90 -72
  5. alembic/versions/d36deffe7fa2_add_beta_state_table_for_conjugate_.py +47 -0
  6. app/api/deps.py +45 -63
  7. app/api/routes_admin.py +36 -25
  8. app/api/routes_governance.py +190 -71
  9. app/api/routes_incidents.py +186 -54
  10. app/api/routes_memory.py +5 -1
  11. app/api/routes_payments.py +7 -5
  12. app/api/routes_pricing.py +104 -0
  13. app/api/routes_risk.py +16 -19
  14. app/api/routes_users.py +7 -19
  15. app/api/webhooks.py +2 -1
  16. app/core/config.py +3 -0
  17. app/core/usage_tracker.py +257 -93
  18. app/database/models_intents.py +48 -6
  19. app/database/session.py +1 -14
  20. app/main.py +207 -67
  21. app/models/__init__.py +1 -1
  22. app/models/incident_models.py +3 -2
  23. app/models/infrastructure_intents.py +7 -40
  24. app/models/intent_models.py +1 -1
  25. app/models/risk_models.py +1 -1
  26. app/services/incident_service.py +2 -1
  27. app/services/intent_adapter.py +162 -65
  28. app/services/intent_service.py +2 -1
  29. app/services/intent_store.py +7 -3
  30. app/services/outcome_service.py +117 -57
  31. app/services/risk_service.py +348 -69
  32. app/services/wilson_monitor.py +56 -0
  33. docker-compose.test.yml +12 -0
  34. docs/authentication.md +25 -0
  35. docs/development.md +55 -0
  36. docs/docs_endpoints.md +314 -0
  37. docs/endpoints.md +34 -0
  38. docs/examples.md +54 -0
  39. docs/index.md +16 -0
  40. monitor.sh +18 -0
  41. render.yaml +19 -0
  42. requirements-dev.txt +3 -0
  43. requirements.txt +9 -5
  44. runtime.txt +2 -0
  45. seed_rag_data.py +67 -0
  46. start.sh +68 -0
  47. tests/conftest.py +128 -0
  48. tests/test_deps.py +15 -0
  49. tests/test_governance.py +71 -0
  50. tests/test_healing_endpoint.py +21 -0
.dockerignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ __pycache__
3
+ *.pyc
4
+ .env
5
+ venv
6
+ .pytest_cache
7
+ .coverage
8
+ htmlcov
.gitignore CHANGED
@@ -1,50 +1,33 @@
1
  # Python
2
  __pycache__/
3
- *.py[cod]
4
- *$py.class
5
- *.so
6
  .Python
7
- build/
8
- develop-eggs/
9
- dist/
10
- downloads/
11
- eggs/
12
- .eggs/
13
- lib/
14
- lib64/
15
- parts/
16
- sdist/
17
- var/
18
- wheels/
19
- *.egg-info/
20
- .installed.cfg
21
- *.egg
22
 
23
- # Virtual Environment
24
  venv/
25
  env/
26
  ENV/
27
- .env/
28
  .venv/
29
 
 
 
 
 
 
30
  # IDE
31
  .vscode/
32
  .idea/
33
  *.swp
34
  *.swo
35
- *~
36
 
37
  # OS
38
  .DS_Store
39
- .DS_Store?
40
- ._*
41
- .Spotlight-V100
42
- .Trashes
43
- ehthumbs.db
44
- Thumbs.db
45
-
46
- # Hugging Face Spaces
47
- data/
48
- models/
49
- logs/
50
- *.log
 
1
  # Python
2
  __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
  .Python
7
+ *.so
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
+ # Virtual environments
10
  venv/
11
  env/
12
  ENV/
 
13
  .venv/
14
 
15
+ # Build artifacts
16
+ dist/
17
+ build/
18
+ *.egg-info/
19
+
20
  # IDE
21
  .vscode/
22
  .idea/
23
  *.swp
24
  *.swo
 
25
 
26
  # OS
27
  .DS_Store
28
+ .env
29
+ test.db
30
+ venv
31
+ .coverage
32
+ monitor.log
33
+ monitor_loop.log
 
 
 
 
 
 
Dockerfile CHANGED
@@ -1,6 +1,7 @@
1
  FROM python:3.12-slim
 
2
  WORKDIR /app
3
  COPY requirements.txt .
4
  RUN pip install --no-cache-dir -r requirements.txt
5
  COPY . .
6
- CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  FROM python:3.12-slim
2
+ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
3
  WORKDIR /app
4
  COPY requirements.txt .
5
  RUN pip install --no-cache-dir -r requirements.txt
6
  COPY . .
7
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,103 +1,121 @@
1
- ---
2
- title: Agentic Reliability Framework (ARF) v4 – Public API Demo
3
- emoji: 🤖
4
- colorFrom: blue
5
- colorTo: green
6
- sdk: docker
7
- python_version: '3.10'
8
- app_file: app.py
9
- pinned: false
10
- ---
11
 
12
- # Agentic Reliability Framework (ARF) – Public API Demo (Sandbox)
13
 
14
- **Problem:** Most AI‑driven governance systems fail silently in production, leading to outages, security breaches, and compliance violations.
15
 
16
- **Solution:** ARF turns probabilistic AI into deterministic, auditable action using Bayesian inference, semantic memory, and **expected loss minimisation**.
 
 
17
 
18
- **Outcome:** Reduce MTTR by up to 85% with self‑healing systems, backed by fully explainable risk scores.
19
 
20
- > ℹ️ **This Space provides a sanitised, mock API endpoint.** The real ARF core engine is proprietary, access‑controlled, and available only to qualified pilots and enterprise customers. See the [public specification](https://arf-foundation.github.io/arf-spec/) for details.
 
 
 
21
 
22
- ---
23
 
24
- ## 🚀 Start Here
25
 
26
- | | |
27
- |--|--|
28
- | **📚 API Docs** | [https://a-r-f-arf-sandbox-api.hf.space/docs](https://a-r-f-arf-sandbox-api.hf.space/docs) |
29
- | **🧪 Live Demo** | [Gradio Dashboard](https://a-r-f-arf-sandbox-api.hf.space/) |
30
- | **📦 Public Spec** | [github.com/arf-foundation/arf-spec](https://github.com/arf-foundation/arf-spec) |
31
- | **📅 Book a Call** | [Calendly](https://calendly.com/petter2025us/30min) |
32
 
33
- ---
34
 
35
- ## 🔍 Quick Example
 
36
 
37
- ```python
38
- import requests
39
 
40
- response = requests.post(
41
- "https://a-r-f-arf-sandbox-api.hf.space/v1/evaluate",
42
- json={
43
- "service_name": "payment-gateway",
44
- "event_type": "latency_spike",
45
- "severity": "high",
46
- "metrics": {"latency_p99": 450, "error_rate": 0.12}
47
- }
48
- )
49
- print(response.json())
50
  ```
51
 
52
- The response includes a mock HealingIntent with:
 
 
 
 
53
 
54
- * risk\_score: simulated failure probability
55
-
56
- * risk\_factors: additive contributions from conjugate prior, hyperprior, and HMC
57
-
58
- * recommended\_action: approve, deny, or escalate
59
-
60
- * decision\_trace: expected losses and variance
61
-
62
 
63
- ⚠️ **All responses from this endpoint are simulated.** The real Bayesian engine is not exposed publicly.
64
 
65
- 🧠 Key Capabilities (Conceptual Overview)
66
- -----------------------------------------
67
 
68
- * **Bayesian Risk Scoring** – Conjugate priors + HMC for calibrated uncertainty.
69
-
70
- * **Semantic Memory** – FAISS‑based retrieval of similar past incidents.
71
-
72
- * **Expected Loss Minimisation** – Chooses approve/deny/escalate by minimising cost-weighted risk, not static thresholds.
73
-
74
- * **Multi‑Agent Orchestration** – Anomaly detection, root cause, forecasting.
75
-
76
 
77
- 📊 Architecture
78
- ---------------
79
 
80
  ```text
81
- User Request Policy Evaluation Cost Estimation → Risk Scoring
82
-
83
- HealingIntent ← Decision (Expected Loss)
84
  ```
85
 
86
- All decisions are immutable, signed, and fully traceable via ancestor\_chain and infrastructure\_intent fields.
 
 
 
 
87
 
88
- 🔧 Local Development
89
- --------------------
90
 
91
  ```bash
92
- docker build -t arf-api .
93
- docker run -p 7860:7860 arf-api
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  ```
95
 
96
- Then open [http://localhost:7860](http://localhost:7860/) for the Gradio UI and [http://localhost:7860/api/docs](http://localhost:7860/api/docs) for the API.
 
 
 
 
 
 
 
 
 
 
97
 
98
- 📚 About ARF
99
- ------------
100
 
101
- The **Agentic Reliability Framework** is a governed, mathematically grounded advisory layer for AI infrastructure. The public specification, demo UI, and sandbox API are open‑source (Apache 2.0). **The core Bayesian engine is proprietary and access‑controlled** — available for pilot evaluation and enterprise licensing under outcome‑based pricing.
 
102
 
103
- Learn more at [github.com/arf-foundation](https://github.com/arf-foundation) and request access via petter2025us@outlook.com.
 
1
+ # arf-api
 
 
 
 
 
 
 
 
 
2
 
3
+ ARF API Control Plane (FastAPI)
4
 
5
+ ## Live Demo
6
 
7
+ The API is deployed and accessible at:
8
+ - **Base URL**: [https://a-r-f-agentic-reliability-framework-api.hf.space](https://a-r-f-agentic-reliability-framework-api.hf.space)
9
+ - **Interactive Documentation**: [https://a-r-f-agentic-reliability-framework-api.hf.space/docs](https://a-r-f-agentic-reliability-framework-api.hf.space/docs)
10
 
11
+ ## Quick Start (Local Development)
12
 
13
+ 1. **Install dependencies**:
14
+ ```bash
15
+ pip install -r requirements.txt
16
+ ```
17
 
18
+ Note: `requirements.txt` installs `agentic-reliability-framework` directly from the project's Git repository.
19
 
20
+ 2. **Set environment variables** (optional, in `.env`):
21
 
22
+ ```text
23
+ ARF_HMC_MODEL – path to HMC model JSON (default: models/hmc_model.json)
 
 
 
 
24
 
25
+ ARF_USE_HYPERPRIORS – true/false
26
 
27
+ API_KEY optional (currently not enforced)
28
+ ```
29
 
30
+ 3. **Run the app locally**:
 
31
 
32
+ ```bash
33
+ uvicorn app.main:app --reload --port 8000
 
 
 
 
 
 
 
 
34
  ```
35
 
36
+ 4. **Health check**:
37
+
38
+ ```bash
39
+ GET http://localhost:8000/health
40
+ ```
41
 
42
+ ## Causal Explainer Endpoint
 
 
 
 
 
 
 
43
 
44
+ The ARF API includes a heuristic causal explainer that evaluates the impact of proposed healing actions using deterministic rules. This module provides counterfactual reasoning without requiring a fitted causal model or external ML dependencies.
45
 
46
+ The explainer estimates how system metrics such as latency would change if a different action were taken.
 
47
 
48
+ ### Mathematical Model
 
 
 
 
 
 
 
49
 
50
+ The counterfactual outcome is computed as:
 
51
 
52
  ```text
53
+ counterfactual_outcome = factual_outcome * (1 + effect_frac)
 
 
54
  ```
55
 
56
+ Where:
57
+
58
+ - `effect_frac` is a predefined impact factor based on the action type
59
+ - effects are multiplicative
60
+ - a fixed ±10% uncertainty interval is applied to the estimated outcome
61
 
62
+ ### Example Request
 
63
 
64
  ```bash
65
+ curl -X POST "http://localhost:8000/api/v1/v1/incidents/evaluate" -H "Content-Type: application/json" -d '{
66
+ "component": "checkout-service",
67
+ "latency_p99": 600,
68
+ "error_rate": 0.2,
69
+ "service_mesh": "default"
70
+ }'
71
+ ```
72
+
73
+ ### Example Response
74
+
75
+ ```json
76
+ {
77
+ "healing_intent": {
78
+ "action": "restart_container",
79
+ "component": "checkout-service",
80
+ "parameters": {},
81
+ "justification": "Causal: If we apply restart_container instead of no_action, latency would change from 600.00 to 510.00 (Δ = -90.00). Based on heuristic causal model.",
82
+ "confidence": 0.85,
83
+ "risk_score": 0.54,
84
+ "status": "oss_advisory_only"
85
+ },
86
+ "causal_explanation": {
87
+ "factual_outcome": 600,
88
+ "counterfactual_outcome": 510,
89
+ "effect": -90,
90
+ "explanation_text": "If we apply restart_container instead of no_action, latency would change from 600.00 to 510.00 (Δ = -90.00). Based on heuristic causal model.",
91
+ "is_model_based": false,
92
+ "warnings": [
93
+ "Using heuristic causal model (no fitted SCM)."
94
+ ]
95
+ },
96
+ "utility_decision": {
97
+ "best_action": "restart_container",
98
+ "expected_utility": 0.5,
99
+ "explanation": "Heuristic decision based on latency/error thresholds"
100
+ }
101
+ }
102
  ```
103
 
104
+ ### Important Notes
105
+
106
+ - This endpoint is advisory only (`status = oss_advisory_only`)
107
+ - No Structural Causal Model (SCM) is fitted
108
+ - No machine learning models are used
109
+ - All effects are based on predefined heuristics
110
+
111
+ Tests
112
+ -----
113
+
114
+ Run `pytest`. Tests use a temporary SQLite DB (`sqlite:///./test.db`) created by the test fixtures.
115
 
116
+ Notes
117
+ -----
118
 
119
+ - The governance endpoints use an in-process `RiskEngine` initialized at startup.
120
+ - The outcome recording endpoint is not implemented in this repository and returns HTTP 501.
121
 
 
alembic/versions/d36deffe7fa2_add_beta_state_table_for_conjugate_.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """add beta_state table for conjugate posterior persistence
2
+
3
+ Revision ID: d36deffe7fa2
4
+ Revises: b2218948f541
5
+ Create Date: 2026-05-02 20:36:04.870145
6
+
7
+ """
8
+ from typing import Sequence, Union
9
+
10
+ from alembic import op
11
+ import sqlalchemy as sa
12
+
13
+
14
+ # revision identifiers, used by Alembic.
15
+ revision: str = 'd36deffe7fa2'
16
+ down_revision: Union[str, Sequence[str], None] = 'b2218948f541'
17
+ branch_labels: Union[str, Sequence[str], None] = None
18
+ depends_on: Union[str, Sequence[str], None] = None
19
+
20
+
21
+ def upgrade() -> None:
22
+ """Upgrade schema."""
23
+ # ### commands auto generated by Alembic - please adjust! ###
24
+ op.create_table('beta_state',
25
+ sa.Column('id', sa.Integer(), nullable=False),
26
+ sa.Column('category', sa.String(length=32), nullable=False),
27
+ sa.Column('alpha', sa.Float(), nullable=False),
28
+ sa.Column('beta', sa.Float(), nullable=False),
29
+ sa.Column('updated_at', sa.DateTime(), nullable=True),
30
+ sa.PrimaryKeyConstraint('id')
31
+ )
32
+ op.create_index(op.f('ix_beta_state_category'), 'beta_state', ['category'], unique=True)
33
+ op.create_index(op.f('ix_beta_state_id'), 'beta_state', ['id'], unique=False)
34
+ op.add_column('intent_outcomes', sa.Column('idempotency_key', sa.String(length=128), nullable=True))
35
+ op.create_unique_constraint(None, 'intent_outcomes', ['idempotency_key'])
36
+ # ### end Alembic commands ###
37
+
38
+
39
+ def downgrade() -> None:
40
+ """Downgrade schema."""
41
+ # ### commands auto generated by Alembic - please adjust! ###
42
+ op.drop_constraint(None, 'intent_outcomes', type_='unique')
43
+ op.drop_column('intent_outcomes', 'idempotency_key')
44
+ op.drop_index(op.f('ix_beta_state_id'), table_name='beta_state')
45
+ op.drop_index(op.f('ix_beta_state_category'), table_name='beta_state')
46
+ op.drop_table('beta_state')
47
+ # ### end Alembic commands ###
app/api/deps.py CHANGED
@@ -4,66 +4,16 @@ from slowapi import Limiter
4
  from slowapi.util import get_remote_address
5
  from app.core.config import settings
6
 
7
- # ---------------------------------------------------------------------------
8
- # Local dummy implementations that replace the private engine classes.
9
- # They provide the same interface as the originals but perform no real work.
10
- # ---------------------------------------------------------------------------
11
- class RiskEngine:
12
- def __init__(self, *args, **kwargs):
13
- pass
14
- def calculate_risk(self, *args, **kwargs):
15
- return (0.38, "mock", {"conjugate_mean": 0.38})
16
- def update_outcome(self, *args, **kwargs):
17
- pass
18
-
19
- class DecisionEngine:
20
- def __init__(self, *args, **kwargs):
21
- pass
22
- def select_optimal_action(self, *args, **kwargs):
23
- class Result:
24
- best_action = type('Action', (), {'value': 'NO_ACTION'})()
25
- expected_utility = 0.0
26
- alternatives = []
27
- explanation = "mock"
28
- raw_data = {}
29
- return Result()
30
- def compute_risk(self, *args, **kwargs):
31
- return 0.0
32
-
33
- class LyapunovStabilityController:
34
- def __init__(self, *args, **kwargs):
35
- pass
36
-
37
- class CausalExplainer:
38
- def __init__(self, *args, **kwargs):
39
- pass
40
-
41
- class RAGGraphMemory:
42
- def __init__(self, *args, **kwargs):
43
- pass
44
- def has_historical_data(self):
45
- return False
46
- def record_outcome(self, *args, **kwargs):
47
- pass
48
-
49
- class ReliabilityEvent:
50
- def __init__(self, component, latency_p99, error_rate, service_mesh="default"):
51
- self.component = component
52
- self.latency_p99 = latency_p99
53
- self.error_rate = error_rate
54
- self.service_mesh = service_mesh
55
-
56
- class HealingAction:
57
- NO_ACTION = "NO_ACTION"
58
- RESTART_CONTAINER = "RESTART_CONTAINER"
59
- SCALE_OUT = "SCALE_OUT"
60
- ROLLBACK = "ROLLBACK"
61
- CIRCUIT_BREAKER = "CIRCUIT_BREAKER"
62
- TRAFFIC_SHIFT = "TRAFFIC_SHIFT"
63
- ALERT_TEAM = "ALERT_TEAM"
64
- # ---------------------------------------------------------------------------
65
 
66
 
 
67
  def get_db():
68
  db = SessionLocal()
69
  try:
@@ -72,10 +22,14 @@ def get_db():
72
  db.close()
73
 
74
 
75
- limiter = Limiter(key_func=get_remote_address, default_limits=[settings.RATE_LIMIT])
 
 
 
 
76
 
77
 
78
- # Singletons (now using local dummies)
79
  _risk_engine = None
80
  _decision_engine = None
81
  _stability_controller = None
@@ -84,8 +38,36 @@ _rag_graph = None
84
 
85
 
86
  def _seed_rag_graph(rag):
87
- # Mock seed no real data
88
- print("RAG seed skipped (sandbox mode)", file=sys.stderr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
 
91
  def get_rag_graph():
@@ -122,4 +104,4 @@ def get_causal_explainer():
122
  global _causal_explainer
123
  if _causal_explainer is None:
124
  _causal_explainer = CausalExplainer()
125
- return _causal_explainer
 
4
  from slowapi.util import get_remote_address
5
  from app.core.config import settings
6
 
7
+ # ARF core engine imports
8
+ from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
9
+ from agentic_reliability_framework.core.decision.decision_engine import DecisionEngine
10
+ from agentic_reliability_framework.core.governance.stability_controller import LyapunovStabilityController
11
+ from agentic_reliability_framework.core.governance.causal_explainer import CausalExplainer
12
+ from agentic_reliability_framework.runtime.memory.rag_graph import RAGGraphMemory
13
+ from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
 
16
+ # Dependency to get DB session
17
  def get_db():
18
  db = SessionLocal()
19
  try:
 
22
  db.close()
23
 
24
 
25
+ # Rate limiter with default limit from settings
26
+ limiter = Limiter(
27
+ key_func=get_remote_address,
28
+ default_limits=[
29
+ settings.RATE_LIMIT])
30
 
31
 
32
+ # ARF engine dependencies (singletons for simplicity)
33
  _risk_engine = None
34
  _decision_engine = None
35
  _stability_controller = None
 
38
 
39
 
40
  def _seed_rag_graph(rag):
41
+ """Seed the RAG graph with historical healing action outcomes."""
42
+ seed_data = [
43
+ ("seed_restart_1", "test", HealingAction.RESTART_CONTAINER.value, True, 2),
44
+ ("seed_restart_2", "test", HealingAction.RESTART_CONTAINER.value, True, 3),
45
+ ("seed_restart_3", "test", HealingAction.RESTART_CONTAINER.value, False, 10),
46
+ ("seed_rollback_1", "test", HealingAction.ROLLBACK.value, True, 1),
47
+ ("seed_rollback_2", "test", HealingAction.ROLLBACK.value, True, 2),
48
+ ("seed_rollback_3", "test", HealingAction.ROLLBACK.value, False, 5),
49
+ ("seed_scale_1", "test", HealingAction.SCALE_OUT.value, True, 5),
50
+ ("seed_scale_2", "test", HealingAction.SCALE_OUT.value, False, 15),
51
+ ("seed_cb_1", "test", HealingAction.CIRCUIT_BREAKER.value, True, 1),
52
+ ("seed_cb_2", "test", HealingAction.CIRCUIT_BREAKER.value, True, 2),
53
+ ("seed_ts_1", "test", HealingAction.TRAFFIC_SHIFT.value, True, 4),
54
+ ("seed_ts_2", "test", HealingAction.TRAFFIC_SHIFT.value, False, 8),
55
+ ]
56
+ for inc_id, comp, action, success, res_time in seed_data:
57
+ event = ReliabilityEvent(
58
+ component=comp,
59
+ latency_p99=500,
60
+ error_rate=0.1,
61
+ service_mesh="default"
62
+ )
63
+ rag.record_outcome(
64
+ incident_id=inc_id,
65
+ event=event,
66
+ action_taken=action,
67
+ success=success,
68
+ resolution_time_minutes=res_time
69
+ )
70
+ print("Seeded RAG graph with historical data", file=sys.stderr)
71
 
72
 
73
  def get_rag_graph():
 
104
  global _causal_explainer
105
  if _causal_explainer is None:
106
  _causal_explainer = CausalExplainer()
107
+ return _causal_explainer
app/api/routes_admin.py CHANGED
@@ -4,25 +4,26 @@ These endpoints should be protected (e.g., by an admin API key) in production.
4
  """
5
  from fastapi import APIRouter, Depends, HTTPException, Query, Path, Body
6
  from pydantic import BaseModel
7
- from typing import Optional, List, Dict, Any
8
  from datetime import datetime
9
  import uuid
10
-
11
  from app.core.usage_tracker import tracker, Tier
12
 
13
  router = APIRouter(prefix="/admin", tags=["admin"])
14
-
15
  # Simple in‑memory admin key (replace with proper auth in production)
16
  ADMIN_API_KEY = "admin_secret_change_me"
17
 
 
18
  def verify_admin(admin_key: str = Query(..., alias="admin_key")):
19
  if admin_key != ADMIN_API_KEY:
20
  raise HTTPException(status_code=403, detail="Invalid admin key")
21
  return True
22
 
 
23
  class CreateKeyRequest(BaseModel):
24
  tier: str
25
 
 
26
  class UpdateTierRequest(BaseModel):
27
  tier: str
28
 
@@ -30,20 +31,20 @@ class UpdateTierRequest(BaseModel):
30
  @router.post("/keys", dependencies=[Depends(verify_admin)])
31
  async def create_api_key(req: CreateKeyRequest):
32
  if req.tier not in [t.value for t in Tier]:
33
- raise HTTPException(status_code=400, detail=f"Invalid tier. Must be one of {[t.value for t in Tier]}")
 
34
  new_key = f"sk_live_{uuid.uuid4().hex[:24]}"
35
  tier_enum = Tier(req.tier)
36
  tracker.get_or_create_api_key(new_key, tier_enum)
37
  return {"api_key": new_key, "tier": req.tier}
38
 
39
 
40
- @router.get("/keys", dependencies=[Depends(verify_admin)])
41
  async def list_api_keys(limit: int = 100, offset: int = 0):
42
  with tracker._get_conn() as conn:
43
  rows = conn.execute(
44
- "SELECT key, tier, created_at, last_used_at, is_active FROM api_keys ORDER BY created_at DESC LIMIT ? OFFSET ?",
45
  (limit, offset)
46
- ).fetchall()
47
  keys = []
48
  for row in rows:
49
  month = tracker._get_month_key()
@@ -52,14 +53,18 @@ async def list_api_keys(limit: int = 100, offset: int = 0):
52
  (row["key"], month)
53
  ).fetchone()
54
  usage = usage_row["count"] if usage_row else 0
55
- keys.append({
56
- "key": row["key"],
57
- "tier": row["tier"],
58
- "created_at": datetime.fromtimestamp(row["created_at"]).isoformat(),
59
- "last_used_at": datetime.fromtimestamp(row["last_used_at"]).isoformat() if row["last_used_at"] else None,
60
- "is_active": bool(row["is_active"]),
61
- "current_month_usage": usage,
62
- })
 
 
 
 
63
  return {"keys": keys, "total": len(keys)}
64
 
65
 
@@ -69,28 +74,33 @@ async def update_key_tier(
69
  req: UpdateTierRequest = Body(...),
70
  ):
71
  if req.tier not in [t.value for t in Tier]:
72
- raise HTTPException(status_code=400, detail=f"Invalid tier. Must be one of {[t.value for t in Tier]}")
 
73
  with tracker._get_conn() as conn:
74
- row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
 
75
  if not row:
76
  raise HTTPException(status_code=404, detail="API key not found")
77
- conn.execute("UPDATE api_keys SET tier = ? WHERE key = ?", (req.tier, api_key))
 
78
  conn.commit()
79
  return {"message": f"Tier updated to {req.tier}"}
80
 
81
 
82
  @router.delete("/keys/{api_key}", dependencies=[Depends(verify_admin)])
83
- async def deactivate_api_key(api_key: str = Path(..., description="The API key to deactivate")):
 
84
  with tracker._get_conn() as conn:
85
- row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
 
86
  if not row:
87
  raise HTTPException(status_code=404, detail="API key not found")
88
- conn.execute("UPDATE api_keys SET is_active = 0 WHERE key = ?", (api_key,))
 
89
  conn.commit()
90
  return {"message": "API key deactivated"}
91
 
92
 
93
- @router.get("/audit/{api_key}", dependencies=[Depends(verify_admin)])
94
  async def get_audit_logs(
95
  api_key: str = Path(..., description="The API key to audit"),
96
  start_date: Optional[str] = Query(None),
@@ -103,11 +113,12 @@ async def get_audit_logs(
103
  return {"api_key": api_key, "logs": logs}
104
 
105
 
106
- @router.get("/stats", dependencies=[Depends(verify_admin)])
107
  async def get_global_stats():
108
  with tracker._get_conn() as conn:
109
- total_keys = conn.execute("SELECT COUNT(*) FROM api_keys WHERE is_active = 1").fetchone()[0]
110
- total_requests = conn.execute("SELECT COUNT(*) FROM usage_log").fetchone()[0]
 
 
111
  by_tier = conn.execute(
112
  "SELECT tier, COUNT(*) as count FROM usage_log GROUP BY tier"
113
  ).fetchall()
 
4
  """
5
  from fastapi import APIRouter, Depends, HTTPException, Query, Path, Body
6
  from pydantic import BaseModel
7
+ from typing import Optional
8
  from datetime import datetime
9
  import uuid
 
10
  from app.core.usage_tracker import tracker, Tier
11
 
12
  router = APIRouter(prefix="/admin", tags=["admin"])
 
13
  # Simple in‑memory admin key (replace with proper auth in production)
14
  ADMIN_API_KEY = "admin_secret_change_me"
15
 
16
+
17
  def verify_admin(admin_key: str = Query(..., alias="admin_key")):
18
  if admin_key != ADMIN_API_KEY:
19
  raise HTTPException(status_code=403, detail="Invalid admin key")
20
  return True
21
 
22
+
23
  class CreateKeyRequest(BaseModel):
24
  tier: str
25
 
26
+
27
  class UpdateTierRequest(BaseModel):
28
  tier: str
29
 
 
31
  @router.post("/keys", dependencies=[Depends(verify_admin)])
32
  async def create_api_key(req: CreateKeyRequest):
33
  if req.tier not in [t.value for t in Tier]:
34
+ raise HTTPException(
35
+ status_code=400, detail=f"Invalid tier. Must be one of {[t.value for t in Tier]}")
36
  new_key = f"sk_live_{uuid.uuid4().hex[:24]}"
37
  tier_enum = Tier(req.tier)
38
  tracker.get_or_create_api_key(new_key, tier_enum)
39
  return {"api_key": new_key, "tier": req.tier}
40
 
41
 
 
42
  async def list_api_keys(limit: int = 100, offset: int = 0):
43
  with tracker._get_conn() as conn:
44
  rows = conn.execute(
45
+ "SELECT key, tier, created_at, last_used_at, is_active FROM api_keys ORDER BY created_at DESC LIMIT ? OFFSET ?", # noqa: E501
46
  (limit, offset)
47
+ ).fetchall() # noqa: E501
48
  keys = []
49
  for row in rows:
50
  month = tracker._get_month_key()
 
53
  (row["key"], month)
54
  ).fetchone()
55
  usage = usage_row["count"] if usage_row else 0
56
+ keys.append(
57
+ {
58
+ "key": row["key"],
59
+ "tier": row["tier"],
60
+ "created_at": datetime.fromtimestamp(
61
+ row["created_at"]).isoformat(),
62
+ "last_used_at": datetime.fromtimestamp(
63
+ row["last_used_at"]).isoformat() if row["last_used_at"] else None,
64
+ "is_active": bool(
65
+ row["is_active"]),
66
+ "current_month_usage": usage,
67
+ })
68
  return {"keys": keys, "total": len(keys)}
69
 
70
 
 
74
  req: UpdateTierRequest = Body(...),
75
  ):
76
  if req.tier not in [t.value for t in Tier]:
77
+ raise HTTPException(
78
+ status_code=400, detail=f"Invalid tier. Must be one of {[t.value for t in Tier]}")
79
  with tracker._get_conn() as conn:
80
+ row = conn.execute(
81
+ "SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
82
  if not row:
83
  raise HTTPException(status_code=404, detail="API key not found")
84
+ conn.execute("UPDATE api_keys SET tier = ? WHERE key = ?",
85
+ (req.tier, api_key))
86
  conn.commit()
87
  return {"message": f"Tier updated to {req.tier}"}
88
 
89
 
90
  @router.delete("/keys/{api_key}", dependencies=[Depends(verify_admin)])
91
+ async def deactivate_api_key(
92
+ api_key: str = Path(..., description="The API key to deactivate")):
93
  with tracker._get_conn() as conn:
94
+ row = conn.execute(
95
+ "SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
96
  if not row:
97
  raise HTTPException(status_code=404, detail="API key not found")
98
+ conn.execute(
99
+ "UPDATE api_keys SET is_active = 0 WHERE key = ?", (api_key,))
100
  conn.commit()
101
  return {"message": "API key deactivated"}
102
 
103
 
 
104
  async def get_audit_logs(
105
  api_key: str = Path(..., description="The API key to audit"),
106
  start_date: Optional[str] = Query(None),
 
113
  return {"api_key": api_key, "logs": logs}
114
 
115
 
 
116
  async def get_global_stats():
117
  with tracker._get_conn() as conn:
118
+ total_keys = conn.execute(
119
+ "SELECT COUNT(*) FROM api_keys WHERE is_active = 1").fetchone()[0]
120
+ total_requests = conn.execute(
121
+ "SELECT COUNT(*) FROM usage_log").fetchone()[0]
122
  by_tier = conn.execute(
123
  "SELECT tier, COUNT(*) as count FROM usage_log GROUP BY tier"
124
  ).fetchall()
app/api/routes_governance.py CHANGED
@@ -1,4 +1,4 @@
1
- from fastapi import APIRouter, Depends, HTTPException, Request, BackgroundTasks
2
  from fastapi.encoders import jsonable_encoder
3
  from sqlalchemy.orm import Session
4
  from app.models.infrastructure_intents import InfrastructureIntentRequest
@@ -8,26 +8,34 @@ from app.services.intent_store import save_evaluated_intent
8
  from app.services.outcome_service import record_outcome
9
  from app.api.deps import get_db
10
  from pydantic import BaseModel
11
- from typing import Optional
12
  import uuid
13
  import logging
14
  import time
 
 
 
15
 
16
- # Optional import from protected core engine – not available in public Spaces
 
 
 
 
17
  try:
18
- from agentic_reliability_framework.core.models.event import ReliabilityEvent
 
19
  except ImportError:
20
- # Local fallback for public sandbox deployments
21
- class ReliabilityEvent(BaseModel):
22
- component: str
23
- latency_p99: float
24
- error_rate: float
25
- service_mesh: str = "default"
26
- cpu_util: Optional[float] = None
27
- memory_util: Optional[float] = None
28
 
29
- # ===== USAGE TRACKER IMPORTS =====
30
- from app.core.usage_tracker import enforce_quota, UsageRecord, tracker
 
 
 
 
 
 
 
31
 
32
  logger = logging.getLogger(__name__)
33
  router = APIRouter()
@@ -50,13 +58,52 @@ async def evaluate_intent_endpoint(
50
  intent_req: InfrastructureIntentRequest,
51
  background_tasks: BackgroundTasks,
52
  db: Session = Depends(get_db),
53
- quota: dict = Depends(enforce_quota)
54
  ):
 
 
 
 
 
 
 
 
 
 
55
  start_time = time.time()
56
- api_key = quota["api_key"]
57
- tier = quota["tier"]
58
- response_data = None
59
- error_msg = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  try:
62
  oss_intent = to_oss_intent(intent_req)
@@ -68,6 +115,10 @@ async def evaluate_intent_endpoint(
68
  policy_violations=intent_req.policy_violations
69
  )
70
 
 
 
 
 
71
  deterministic_id = str(uuid.uuid4())
72
  api_payload = jsonable_encoder(intent_req.model_dump())
73
  oss_payload = jsonable_encoder(oss_intent.model_dump())
@@ -85,36 +136,39 @@ async def evaluate_intent_endpoint(
85
  result["intent_id"] = deterministic_id
86
  response_data = result
87
 
88
- if tracker:
89
- record = UsageRecord(
90
- api_key=api_key,
91
- tier=tier,
92
- timestamp=time.time(),
93
- endpoint="/api/v1/intents/evaluate",
94
- request_body=intent_req.model_dump(),
95
- response=response_data,
96
- processing_ms=(time.time() - start_time) * 1000,
 
 
 
97
  )
98
- await tracker.increment_usage_async(record, background_tasks)
 
 
 
 
99
 
100
  return response_data
101
 
102
  except HTTPException:
 
 
 
103
  raise
104
  except Exception as e:
105
  error_msg = str(e)
106
  logger.exception("Error in evaluate_intent_endpoint")
107
- if tracker:
108
- record = UsageRecord(
109
- api_key=api_key,
110
- tier=tier,
111
- timestamp=time.time(),
112
- endpoint="/api/v1/intents/evaluate",
113
- request_body=intent_req.model_dump(),
114
- error=error_msg,
115
- processing_ms=(time.time() - start_time) * 1000,
116
- )
117
- await tracker.increment_usage_async(record, background_tasks)
118
  raise HTTPException(status_code=500, detail=error_msg)
119
 
120
 
@@ -122,9 +176,14 @@ async def evaluate_intent_endpoint(
122
  async def record_outcome_endpoint(
123
  request: Request,
124
  outcome: OutcomeRequest,
125
- db: Session = Depends(get_db)
 
126
  ):
127
- # No usage tracking for outcomes (doesn't count against quota)
 
 
 
 
128
  try:
129
  risk_engine = request.app.state.risk_engine
130
  outcome_record = record_outcome(
@@ -133,8 +192,27 @@ async def record_outcome_endpoint(
133
  success=outcome.success,
134
  recorded_by=outcome.recorded_by,
135
  notes=outcome.notes,
136
- risk_engine=risk_engine
 
137
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  return {"message": "Outcome recorded", "outcome_id": outcome_record.id}
139
  except Exception as e:
140
  raise HTTPException(status_code=500, detail=str(e))
@@ -145,13 +223,51 @@ async def evaluate_healing_decision_endpoint(
145
  request: Request,
146
  decision_req: HealingDecisionRequest,
147
  background_tasks: BackgroundTasks,
148
- quota: dict = Depends(enforce_quota)
149
  ):
 
 
 
 
 
 
 
 
 
150
  start_time = time.time()
151
- api_key = quota["api_key"]
152
- tier = quota["tier"]
153
- response_data = None
154
- error_msg = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
  try:
157
  policy_engine = request.app.state.policy_engine
@@ -168,34 +284,37 @@ async def evaluate_healing_decision_endpoint(
168
  tokenizer=tokenizer,
169
  )
170
 
171
- if tracker:
172
- record = UsageRecord(
173
- api_key=api_key,
174
- tier=tier,
175
- timestamp=time.time(),
176
- endpoint="/api/v1/healing/evaluate",
177
- request_body=decision_req.model_dump(),
178
- response=response_data,
179
- processing_ms=(time.time() - start_time) * 1000,
180
- )
181
- await tracker.increment_usage_async(record, background_tasks)
182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  return response_data
184
 
185
  except HTTPException:
 
 
 
186
  raise
187
  except Exception as e:
188
  error_msg = str(e)
189
  logger.exception("Error in evaluate_healing_decision_endpoint")
190
- if tracker:
191
- record = UsageRecord(
192
- api_key=api_key,
193
- tier=tier,
194
- timestamp=time.time(),
195
- endpoint="/api/v1/healing/evaluate",
196
- request_body=decision_req.model_dump(),
197
- error=error_msg,
198
- processing_ms=(time.time() - start_time) * 1000,
199
- )
200
- await tracker.increment_usage_async(record, background_tasks)
201
- raise HTTPException(status_code=500, detail=error_msg)
 
1
+ from fastapi import APIRouter, Depends, HTTPException, Request, BackgroundTasks, Header
2
  from fastapi.encoders import jsonable_encoder
3
  from sqlalchemy.orm import Session
4
  from app.models.infrastructure_intents import InfrastructureIntentRequest
 
8
  from app.services.outcome_service import record_outcome
9
  from app.api.deps import get_db
10
  from pydantic import BaseModel
 
11
  import uuid
12
  import logging
13
  import time
14
+ from typing import Optional
15
+
16
+ from agentic_reliability_framework.core.models.event import ReliabilityEvent
17
 
18
+ # ===== USAGE TRACKER IMPORTS =====
19
+ import app.core.usage_tracker
20
+ from app.core.usage_tracker import UsageRecord
21
+
22
+ # ===== PRICING CALCULATOR INTEGRATION =====
23
  try:
24
+ from arf_pricing_calculator.storage.buffer import add_event
25
+ PRICING_AVAILABLE = True
26
  except ImportError:
27
+ PRICING_AVAILABLE = False
28
+ add_event = None
 
 
 
 
 
 
29
 
30
+ # ===== OpenTelemetry (optional) =====
31
+ try:
32
+ from opentelemetry import trace
33
+ from opentelemetry.trace import Status, StatusCode
34
+ _tracer = trace.get_tracer(__name__)
35
+ OTEL_AVAILABLE = True
36
+ except ImportError:
37
+ OTEL_AVAILABLE = False
38
+ _tracer = None
39
 
40
  logger = logging.getLogger(__name__)
41
  router = APIRouter()
 
58
  intent_req: InfrastructureIntentRequest,
59
  background_tasks: BackgroundTasks,
60
  db: Session = Depends(get_db),
61
+ idempotency_key: Optional[str] = Header(None, alias="Idempotency-Key"),
62
  ):
63
+ """
64
+ Evaluate an infrastructure intent with idempotency and atomic quota consumption.
65
+ """
66
+ # ── optional trace ──────────────────────────────────────
67
+ span = None
68
+ if OTEL_AVAILABLE and _tracer:
69
+ span = _tracer.start_span("governance.evaluate_intent")
70
+ span.set_attribute("intent_type", intent_req.intent_type)
71
+ span.set_attribute("environment", str(intent_req.environment))
72
+
73
  start_time = time.time()
74
+ api_key = request.headers.get("Authorization", "").replace("Bearer ", "")
75
+ if not api_key:
76
+ api_key = request.query_params.get("api_key", "unknown")
77
+
78
+ current_tracker = app.core.usage_tracker.tracker
79
+ if current_tracker is None:
80
+ if span:
81
+ span.set_status(Status(StatusCode.ERROR, "tracker unavailable"))
82
+ span.end()
83
+ raise HTTPException(status_code=503,
84
+ detail="Usage tracking service unavailable")
85
+
86
+ record = UsageRecord(
87
+ api_key=api_key,
88
+ tier=None,
89
+ timestamp=start_time,
90
+ endpoint="/api/v1/intents/evaluate",
91
+ request_body=intent_req.model_dump(),
92
+ processing_ms=None,
93
+ )
94
+ success, existing_response = current_tracker.consume_quota_and_log(
95
+ record=record,
96
+ idempotency_key=idempotency_key
97
+ )
98
+ if not success:
99
+ if span:
100
+ span.set_attribute("idempotent_hit", True if existing_response else False)
101
+ span.end()
102
+ if existing_response:
103
+ return existing_response
104
+ else:
105
+ raise HTTPException(status_code=429,
106
+ detail="Monthly evaluation quota exceeded")
107
 
108
  try:
109
  oss_intent = to_oss_intent(intent_req)
 
115
  policy_violations=intent_req.policy_violations
116
  )
117
 
118
+ if span:
119
+ span.set_attribute("risk_score", result["risk_score"])
120
+ span.set_attribute("deterministic_id", str(uuid.uuid4())) # will be overwritten later, but fine for trace
121
+
122
  deterministic_id = str(uuid.uuid4())
123
  api_payload = jsonable_encoder(intent_req.model_dump())
124
  oss_payload = jsonable_encoder(oss_intent.model_dump())
 
136
  result["intent_id"] = deterministic_id
137
  response_data = result
138
 
139
+ if current_tracker:
140
+ background_tasks.add_task(
141
+ current_tracker._insert_audit_log,
142
+ UsageRecord(
143
+ api_key=api_key,
144
+ tier=None,
145
+ timestamp=time.time(),
146
+ endpoint="/api/v1/intents/evaluate/response",
147
+ request_body=None,
148
+ response=response_data,
149
+ processing_ms=(time.time() - start_time) * 1000,
150
+ )
151
  )
152
+
153
+ if span:
154
+ span.set_attribute("intent_id", deterministic_id)
155
+ span.set_status(Status(StatusCode.OK))
156
+ span.end()
157
 
158
  return response_data
159
 
160
  except HTTPException:
161
+ if span:
162
+ span.set_status(Status(StatusCode.ERROR, "HTTP exception"))
163
+ span.end()
164
  raise
165
  except Exception as e:
166
  error_msg = str(e)
167
  logger.exception("Error in evaluate_intent_endpoint")
168
+ if span:
169
+ span.set_status(Status(StatusCode.ERROR, error_msg))
170
+ span.record_exception(e)
171
+ span.end()
 
 
 
 
 
 
 
172
  raise HTTPException(status_code=500, detail=error_msg)
173
 
174
 
 
176
  async def record_outcome_endpoint(
177
  request: Request,
178
  outcome: OutcomeRequest,
179
+ db: Session = Depends(get_db),
180
+ idempotency_key: Optional[str] = Header(None, alias="Idempotency-Key"),
181
  ):
182
+ """
183
+ Record an outcome for a previously evaluated intent.
184
+ Idempotent based on deterministic_id and success value (handled in service).
185
+ Also updates the pricing calculator's calibration buffer if available.
186
+ """
187
  try:
188
  risk_engine = request.app.state.risk_engine
189
  outcome_record = record_outcome(
 
192
  success=outcome.success,
193
  recorded_by=outcome.recorded_by,
194
  notes=outcome.notes,
195
+ risk_engine=risk_engine,
196
+ idempotency_key=idempotency_key,
197
  )
198
+
199
+ if PRICING_AVAILABLE and add_event is not None:
200
+ try:
201
+ event = {
202
+ "run_id": outcome.deterministic_id,
203
+ "outcome": "success" if outcome.success else "failure",
204
+ "recorded_at": time.time(),
205
+ "source": "arf_api_outcome"
206
+ }
207
+ add_event(event)
208
+ logger.info(
209
+ f"Added outcome to pricing buffer for intent {
210
+ outcome.deterministic_id}")
211
+ except Exception as e:
212
+ logger.warning(
213
+ f"Failed to update pricing buffer for intent {
214
+ outcome.deterministic_id}: {e}")
215
+
216
  return {"message": "Outcome recorded", "outcome_id": outcome_record.id}
217
  except Exception as e:
218
  raise HTTPException(status_code=500, detail=str(e))
 
223
  request: Request,
224
  decision_req: HealingDecisionRequest,
225
  background_tasks: BackgroundTasks,
226
+ idempotency_key: Optional[str] = Header(None, alias="Idempotency-Key"),
227
  ):
228
+ """
229
+ Evaluate a healing decision with idempotency and atomic quota consumption.
230
+ """
231
+ # ── optional trace ──────────────────────────────────────
232
+ span = None
233
+ if OTEL_AVAILABLE and _tracer:
234
+ span = _tracer.start_span("governance.evaluate_healing")
235
+ span.set_attribute("component", decision_req.event.component)
236
+
237
  start_time = time.time()
238
+ api_key = request.headers.get("Authorization", "").replace("Bearer ", "")
239
+ if not api_key:
240
+ api_key = request.query_params.get("api_key", "unknown")
241
+
242
+ current_tracker = app.core.usage_tracker.tracker
243
+ if current_tracker is None:
244
+ if span:
245
+ span.set_status(Status(StatusCode.ERROR, "tracker unavailable"))
246
+ span.end()
247
+ raise HTTPException(status_code=503,
248
+ detail="Usage tracking service unavailable")
249
+
250
+ record = UsageRecord(
251
+ api_key=api_key,
252
+ tier=None,
253
+ timestamp=start_time,
254
+ endpoint="/api/v1/healing/evaluate",
255
+ request_body=decision_req.model_dump(),
256
+ processing_ms=None,
257
+ )
258
+ success, existing_response = current_tracker.consume_quota_and_log(
259
+ record=record,
260
+ idempotency_key=idempotency_key
261
+ )
262
+ if not success:
263
+ if span:
264
+ span.set_attribute("idempotent_hit", True if existing_response else False)
265
+ span.end()
266
+ if existing_response:
267
+ return existing_response
268
+ else:
269
+ raise HTTPException(status_code=429,
270
+ detail="Monthly evaluation quota exceeded")
271
 
272
  try:
273
  policy_engine = request.app.state.policy_engine
 
284
  tokenizer=tokenizer,
285
  )
286
 
287
+ if span:
288
+ span.set_attribute("risk_score", response_data.get("risk_score", 0.0))
289
+ span.set_attribute("selected_action", response_data.get("selected_action", "unknown"))
290
+ span.set_status(Status(StatusCode.OK))
291
+ span.end()
 
 
 
 
 
 
292
 
293
+ if current_tracker:
294
+ background_tasks.add_task(
295
+ current_tracker._insert_audit_log,
296
+ UsageRecord(
297
+ api_key=api_key,
298
+ tier=None,
299
+ timestamp=time.time(),
300
+ endpoint="/api/v1/healing/evaluate/response",
301
+ request_body=None,
302
+ response=response_data,
303
+ processing_ms=(time.time() - start_time) * 1000,
304
+ )
305
+ )
306
  return response_data
307
 
308
  except HTTPException:
309
+ if span:
310
+ span.set_status(Status(StatusCode.ERROR, "HTTP exception"))
311
+ span.end()
312
  raise
313
  except Exception as e:
314
  error_msg = str(e)
315
  logger.exception("Error in evaluate_healing_decision_endpoint")
316
+ if span:
317
+ span.set_status(Status(StatusCode.ERROR, error_msg))
318
+ span.record_exception(e)
319
+ span.end()
320
+ raise HTTPException(status_code=500, detail=error_msg)
 
 
 
 
 
 
 
app/api/routes_incidents.py CHANGED
@@ -1,86 +1,211 @@
1
- from app.causal_explainer import CausalExplainer
2
- from fastapi import APIRouter, Depends, Request, BackgroundTasks, HTTPException
3
- from pydantic import BaseModel
4
- from typing import Optional
5
- from enum import Enum
6
- import time
7
- import json
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- # ===== USAGE TRACKER IMPORTS =====
10
- from app.core.usage_tracker import enforce_quota, UsageRecord, tracker
11
 
 
 
 
12
 
13
- class HealingAction(str, Enum):
14
- NO_ACTION = "no_action"
15
- RESTART_CONTAINER = "restart_container"
16
- SCALE_OUT = "scale_out"
17
- ROLLBACK = "rollback"
18
- CIRCUIT_BREAKER = "circuit_breaker"
19
- TRAFFIC_SHIFT = "traffic_shift"
20
- ALERT_TEAM = "alert_team"
21
 
 
 
 
 
22
 
23
- class ReliabilityEvent(BaseModel):
24
- component: str
25
- latency_p99: float
26
- error_rate: float
27
- service_mesh: str = "default"
28
- cpu_util: Optional[float] = None
29
- memory_util: Optional[float] = None
30
 
 
31
 
32
  router = APIRouter()
33
- incident_history = []
34
 
 
 
 
 
35
 
 
 
 
 
36
  @router.post("/report_incident")
37
- async def report_incident(event: ReliabilityEvent):
38
- incident_history.append(event.dict())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  return {"status": "recorded"}
40
 
41
 
 
 
 
42
  @router.post("/v1/incidents/evaluate")
43
  async def evaluate_incident(
44
  request: Request,
45
  event: ReliabilityEvent,
46
  background_tasks: BackgroundTasks,
47
- quota: dict = Depends(enforce_quota)
48
- ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  start_time = time.time()
50
- api_key = quota["api_key"]
51
  tier = quota["tier"]
52
- response_data = None
53
- error_msg = None
54
 
55
  try:
56
- # Simple risk score (heuristic)
57
- risk_score = min(1.0, (event.latency_p99 / 1000.0) * 0.7 + event.error_rate * 0.3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- if event.latency_p99 > 500 or event.error_rate > 0.15:
60
- optimal_action = HealingAction.RESTART_CONTAINER
61
- else:
62
- optimal_action = HealingAction.NO_ACTION
 
 
 
 
 
 
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  current_state = {
65
  "latency": event.latency_p99,
66
  "error_rate": event.error_rate,
67
- "last_action": {"action_type": "no_action"}
68
  }
69
  proposed_action = {"action_type": optimal_action.value, "params": {}}
70
- ce = CausalExplainer()
71
- causal_exp = ce.explain_healing_intent(proposed_action, current_state, "latency")
 
72
 
 
 
 
73
  healing_intent = {
74
  "action": optimal_action.value,
75
  "component": event.component,
76
- "parameters": proposed_action["params"],
77
- "justification": f"Causal: {causal_exp.explanation_text}",
78
- "confidence": 0.85,
79
- "risk_score": risk_score,
80
- "status": "oss_advisory_only"
 
 
 
81
  }
82
 
83
  response_data = {
 
 
 
 
84
  "healing_intent": healing_intent,
85
  "causal_explanation": {
86
  "factual_outcome": causal_exp.factual_outcome,
@@ -88,42 +213,49 @@ async def evaluate_incident(
88
  "effect": causal_exp.effect,
89
  "explanation_text": causal_exp.explanation_text,
90
  "is_model_based": causal_exp.is_model_based,
91
- "warnings": causal_exp.warnings
92
  },
93
  "utility_decision": {
94
  "best_action": optimal_action.value,
95
  "expected_utility": 0.5,
96
- "explanation": "Heuristic decision based on latency/error thresholds"
97
- }
 
 
98
  }
99
 
 
100
  # Asynchronous usage logging
 
101
  if tracker:
102
  record = UsageRecord(
103
  api_key=api_key,
104
  tier=tier,
105
  timestamp=time.time(),
106
  endpoint="/v1/incidents/evaluate",
107
- request_body=event.dict(),
108
  response=response_data,
109
  processing_ms=(time.time() - start_time) * 1000,
110
  )
111
  await tracker.increment_usage_async(record, background_tasks)
112
 
 
 
 
 
113
  return response_data
114
 
115
  except HTTPException:
116
  raise
117
- except Exception as e:
118
- error_msg = str(e)
119
- # Log failure in background
120
  if tracker:
121
  record = UsageRecord(
122
  api_key=api_key,
123
  tier=tier,
124
  timestamp=time.time(),
125
  endpoint="/v1/incidents/evaluate",
126
- request_body=event.dict(),
127
  error=error_msg,
128
  processing_ms=(time.time() - start_time) * 1000,
129
  )
 
1
+ """
2
+ Incident evaluation endpoints backward‑compatible Bayesian reroute.
3
+
4
+ This module provides two incident‑related routes:
5
+
6
+ * ``POST /api/v1/report_incident``
7
+ Stores a ``ReliabilityEvent`` in an in‑memory history for auditing
8
+ and debugging.
9
+ * ``POST /api/v1/v1/incidents/evaluate`` **(deprecated)**
10
+ Former heuristic endpoint now **rerouted to the full Bayesian risk
11
+ engine**. All callers should migrate to
12
+ ``POST /api/v1/intents/evaluate``, which returns richer metadata
13
+ including CUDL uncertainty decomposition and decision traces.
14
+
15
+ The local model duplicates (``ReliabilityEvent``, ``HealingAction``)
16
+ have been removed; all types are imported from the canonical ARF core
17
+ framework (``agentic_reliability_framework.core.models.event``).
18
+ """
19
 
20
+ from __future__ import annotations
 
21
 
22
+ import logging
23
+ import time
24
+ from typing import Optional
25
 
26
+ from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request
 
 
 
 
 
 
 
27
 
28
+ from agentic_reliability_framework.core.models.event import (
29
+ HealingAction,
30
+ ReliabilityEvent,
31
+ )
32
 
33
+ from app.causal_explainer import CausalExplainer
34
+ from app.core.usage_tracker import UsageRecord, enforce_quota, tracker
 
 
 
 
 
35
 
36
+ logger = logging.getLogger(__name__)
37
 
38
  router = APIRouter()
 
39
 
40
+ # ---------------------------------------------------------------------------
41
+ # In‑memory incident store (for auditing / debugging only)
42
+ # ---------------------------------------------------------------------------
43
+ incident_history: list[dict] = []
44
 
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # POST /api/v1/report_incident
48
+ # ---------------------------------------------------------------------------
49
  @router.post("/report_incident")
50
+ async def report_incident(event: ReliabilityEvent) -> dict[str, str]:
51
+ """
52
+ Record a ``ReliabilityEvent`` in the in‑memory incident history.
53
+
54
+ This endpoint is used by internal monitoring tools to feed incident
55
+ data into the causal explainer and downstream analysis. The event
56
+ is stored as a JSON‑safe dictionary and is **not** persisted across
57
+ API restarts.
58
+
59
+ Parameters
60
+ ----------
61
+ event : ReliabilityEvent
62
+ The reliability event to record. Must include at minimum
63
+ ``component``, ``latency_p99``, ``error_rate``, and
64
+ ``service_mesh``.
65
+
66
+ Returns
67
+ -------
68
+ dict
69
+ A simple acknowledgement ``{"status": "recorded"}``.
70
+ """
71
+ incident_history.append(event.model_dump(mode="json"))
72
  return {"status": "recorded"}
73
 
74
 
75
+ # ---------------------------------------------------------------------------
76
+ # POST /api/v1/v1/incidents/evaluate (deprecated)
77
+ # ---------------------------------------------------------------------------
78
  @router.post("/v1/incidents/evaluate")
79
  async def evaluate_incident(
80
  request: Request,
81
  event: ReliabilityEvent,
82
  background_tasks: BackgroundTasks,
83
+ quota: dict = Depends(enforce_quota),
84
+ ) -> dict:
85
+ """
86
+ Evaluate an incident using the **Bayesian risk engine**.
87
+
88
+ .. deprecated:: 0.6.0
89
+ Use ``POST /api/v1/intents/evaluate`` instead. This endpoint
90
+ will be removed in a future release. Responses include a
91
+ ``deprecation_notice`` field to assist migration.
92
+
93
+ The following steps are performed:
94
+
95
+ 1. Convert the ``ReliabilityEvent`` into a minimal
96
+ ``DeployConfigurationIntent`` via ``intent_adapter``.
97
+ 2. Call ``risk_service.evaluate_intent()`` to obtain a Bayesian
98
+ risk score.
99
+ 3. Generate a heuristic healing action based on the risk score.
100
+ 4. Run the causal explainer for counter‑factual text.
101
+ 5. Build a backward‑compatible response envelope.
102
+
103
+ Parameters
104
+ ----------
105
+ request : Request
106
+ The Starlette request object (used for internal state access).
107
+ event : ReliabilityEvent
108
+ The incident event containing component name, latency, error
109
+ rate, etc.
110
+ background_tasks : BackgroundTasks
111
+ FastAPI background‑task runner for asynchronous logging.
112
+ quota : dict
113
+ Injected by ``enforce_quota``; contains ``api_key``, ``tier``,
114
+ and ``remaining``.
115
+
116
+ Returns
117
+ -------
118
+ dict
119
+ A dictionary with keys:
120
+
121
+ * ``deprecation_notice`` (str) — migration guidance.
122
+ * ``healing_intent`` (dict) — action, component, risk score,
123
+ justification, confidence, and advisory status.
124
+ * ``causal_explanation`` (dict) — factual/counter‑factual
125
+ outcomes and explanation text.
126
+ * ``utility_decision`` (dict) — selected action and expected
127
+ utility.
128
+ """
129
  start_time = time.time()
130
+ api_key: str = quota["api_key"]
131
  tier = quota["tier"]
132
+ response_data: Optional[dict] = None
133
+ error_msg: Optional[str] = None
134
 
135
  try:
136
+ # ------------------------------------------------------------------
137
+ # Step 1 Convert the event into an infrastructure intent
138
+ # ------------------------------------------------------------------
139
+ from app.services.intent_adapter import to_oss_intent
140
+ from app.services.risk_service import evaluate_intent
141
+
142
+ raw_intent = {
143
+ "intent_type": "deploy_config",
144
+ "environment": "prod",
145
+ "service_name": event.component,
146
+ "requester": "auto",
147
+ "change_scope": "global",
148
+ "deployment_target": "prod",
149
+ "configuration": {},
150
+ "provenance": {"source": "incident_evaluate"},
151
+ }
152
+ oss_intent = to_oss_intent(raw_intent)
153
 
154
+ # ------------------------------------------------------------------
155
+ # Step 2 – Bayesian risk evaluation
156
+ # ------------------------------------------------------------------
157
+ risk_engine = request.app.state.risk_engine
158
+ result = evaluate_intent(
159
+ engine=risk_engine,
160
+ intent=oss_intent,
161
+ cost_estimate=None,
162
+ policy_violations=[],
163
+ )
164
 
165
+ # ------------------------------------------------------------------
166
+ # Step 3 – Heuristic action selection based on risk threshold
167
+ # ------------------------------------------------------------------
168
+ optimal_action = (
169
+ HealingAction.RESTART_CONTAINER
170
+ if result["risk_score"] > 0.5
171
+ else HealingAction.NO_ACTION
172
+ )
173
+
174
+ # ------------------------------------------------------------------
175
+ # Step 4 – Causal explainer
176
+ # ------------------------------------------------------------------
177
+ causal_explainer = CausalExplainer()
178
  current_state = {
179
  "latency": event.latency_p99,
180
  "error_rate": event.error_rate,
181
+ "last_action": {"action_type": "no_action"},
182
  }
183
  proposed_action = {"action_type": optimal_action.value, "params": {}}
184
+ causal_exp = causal_explainer.explain_healing_intent(
185
+ proposed_action, current_state, "latency"
186
+ )
187
 
188
+ # ------------------------------------------------------------------
189
+ # Step 5 – Build response envelope
190
+ # ------------------------------------------------------------------
191
  healing_intent = {
192
  "action": optimal_action.value,
193
  "component": event.component,
194
+ "parameters": {},
195
+ "justification": (
196
+ f"Bayesian risk score: {result['risk_score']:.3f}. "
197
+ f"Causal: {causal_exp.explanation_text}"
198
+ ),
199
+ "confidence": 1.0 - result.get("uncertainty", 0.0),
200
+ "risk_score": result["risk_score"],
201
+ "status": "oss_advisory_only",
202
  }
203
 
204
  response_data = {
205
+ "deprecation_notice": (
206
+ "This endpoint is deprecated. Use POST /api/v1/intents/evaluate "
207
+ "for the full Bayesian evaluation with CUDL decomposition."
208
+ ),
209
  "healing_intent": healing_intent,
210
  "causal_explanation": {
211
  "factual_outcome": causal_exp.factual_outcome,
 
213
  "effect": causal_exp.effect,
214
  "explanation_text": causal_exp.explanation_text,
215
  "is_model_based": causal_exp.is_model_based,
216
+ "warnings": causal_exp.warnings,
217
  },
218
  "utility_decision": {
219
  "best_action": optimal_action.value,
220
  "expected_utility": 0.5,
221
+ "explanation": (
222
+ "Decision based on Bayesian risk threshold > 0.5"
223
+ ),
224
+ },
225
  }
226
 
227
+ # ------------------------------------------------------------------
228
  # Asynchronous usage logging
229
+ # ------------------------------------------------------------------
230
  if tracker:
231
  record = UsageRecord(
232
  api_key=api_key,
233
  tier=tier,
234
  timestamp=time.time(),
235
  endpoint="/v1/incidents/evaluate",
236
+ request_body=event.model_dump(mode="json"),
237
  response=response_data,
238
  processing_ms=(time.time() - start_time) * 1000,
239
  )
240
  await tracker.increment_usage_async(record, background_tasks)
241
 
242
+ logger.warning(
243
+ "Deprecated endpoint /v1/incidents/evaluate called by key %s",
244
+ api_key[:8],
245
+ )
246
  return response_data
247
 
248
  except HTTPException:
249
  raise
250
+ except Exception as exc:
251
+ error_msg = str(exc)
 
252
  if tracker:
253
  record = UsageRecord(
254
  api_key=api_key,
255
  tier=tier,
256
  timestamp=time.time(),
257
  endpoint="/v1/incidents/evaluate",
258
+ request_body=event.model_dump(mode="json"),
259
  error=error_msg,
260
  processing_ms=(time.time() - start_time) * 1000,
261
  )
app/api/routes_memory.py CHANGED
@@ -11,7 +11,11 @@ async def memory_stats(request: Request):
11
  risk_engine = request.app.state.risk_engine
12
 
13
  # Check if memory exists and has the required method
14
- if hasattr(risk_engine, 'memory') and hasattr(risk_engine.memory, 'get_graph_stats'):
 
 
 
 
15
  stats = risk_engine.memory.get_graph_stats()
16
  return stats
17
  else:
 
11
  risk_engine = request.app.state.risk_engine
12
 
13
  # Check if memory exists and has the required method
14
+ if hasattr(
15
+ risk_engine,
16
+ 'memory') and hasattr(
17
+ risk_engine.memory,
18
+ 'get_graph_stats'):
19
  stats = risk_engine.memory.get_graph_stats()
20
  return stats
21
  else:
app/api/routes_payments.py CHANGED
@@ -4,11 +4,9 @@ Payment endpoints – Stripe Checkout integration.
4
 
5
  import os
6
  import stripe
7
- from fastapi import APIRouter, HTTPException, Request
8
  from pydantic import BaseModel
9
- from typing import Optional
10
 
11
- from app.core.config import settings
12
  from app.core.usage_tracker import tracker, Tier
13
 
14
  router = APIRouter(prefix="/payments", tags=["payments"])
@@ -17,8 +15,10 @@ router = APIRouter(prefix="/payments", tags=["payments"])
17
  stripe.api_key = os.getenv("STRIPE_SECRET_KEY")
18
  STRIPE_WEBHOOK_SECRET = os.getenv("STRIPE_WEBHOOK_SECRET")
19
 
 
20
  class CheckoutRequest(BaseModel):
21
  api_key: str
 
22
  success_url: str
23
  cancel_url: str
24
 
@@ -32,14 +32,16 @@ async def create_checkout_session(req: CheckoutRequest):
32
  # Verify the API key exists and is free tier
33
  tier = tracker.get_tier(req.api_key) if tracker else None
34
  if tier != Tier.FREE:
35
- raise HTTPException(status_code=400, detail="Only free tier keys can be upgraded")
 
36
 
37
  try:
38
  checkout_session = stripe.checkout.Session.create(
39
  payment_method_types=["card"],
40
  line_items=[
41
  {
42
- "price": os.getenv("STRIPE_PRO_PRICE_ID"), # e.g., "price_123"
 
43
  "quantity": 1,
44
  }
45
  ],
 
4
 
5
  import os
6
  import stripe
7
+ from fastapi import APIRouter, HTTPException
8
  from pydantic import BaseModel
 
9
 
 
10
  from app.core.usage_tracker import tracker, Tier
11
 
12
  router = APIRouter(prefix="/payments", tags=["payments"])
 
15
  stripe.api_key = os.getenv("STRIPE_SECRET_KEY")
16
  STRIPE_WEBHOOK_SECRET = os.getenv("STRIPE_WEBHOOK_SECRET")
17
 
18
+
19
  class CheckoutRequest(BaseModel):
20
  api_key: str
21
+
22
  success_url: str
23
  cancel_url: str
24
 
 
32
  # Verify the API key exists and is free tier
33
  tier = tracker.get_tier(req.api_key) if tracker else None
34
  if tier != Tier.FREE:
35
+ raise HTTPException(status_code=400,
36
+ detail="Only free tier keys can be upgraded")
37
 
38
  try:
39
  checkout_session = stripe.checkout.Session.create(
40
  payment_method_types=["card"],
41
  line_items=[
42
  {
43
+ # e.g., "price_123"
44
+ "price": os.getenv("STRIPE_PRO_PRICE_ID"),
45
  "quantity": 1,
46
  }
47
  ],
app/api/routes_pricing.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pricing endpoints – integrates the ARF Bayesian pricing calculator.
3
+ """
4
+
5
+ from fastapi import APIRouter, HTTPException, Depends
6
+ from pydantic import BaseModel
7
+ import logging
8
+
9
+ from arf_pricing_calculator.core.pricing_engine import PricingEngine
10
+ from arf_pricing_calculator.ingestion.questionnaire_parser import parse_input_dict
11
+ from arf_pricing_calculator.types import PricingOutput
12
+ from app.core.usage_tracker import enforce_quota
13
+
14
+ logger = logging.getLogger(__name__)
15
+ router = APIRouter()
16
+
17
+
18
+ class PricingEstimateRequest(BaseModel):
19
+ """Request body for single pricing estimate."""
20
+ input: dict
21
+ customer_id: str = "default"
22
+ force: bool = False
23
+
24
+
25
+ class PricingRunRequest(BaseModel):
26
+ """Request body for multi‑run pricing with learning."""
27
+ input: dict
28
+ customer_id: str = "default"
29
+ runs: int = 1
30
+ cooldown_hours: int = 24
31
+ force: bool = False
32
+
33
+
34
+ @router.post("/pricing/estimate", response_model=PricingOutput)
35
+ async def estimate_pricing(
36
+ req: PricingEstimateRequest,
37
+ quota: dict = Depends(enforce_quota), # optional: enforce usage tracking
38
+ ):
39
+ """
40
+ Single pricing estimate – no learning, no buffer update.
41
+ """
42
+ try:
43
+ # Convert the input dict to a PricingInput object
44
+ pricing_input = parse_input_dict(req.input)
45
+ # Create engine without buffer (no learning)
46
+ engine = PricingEngine(calibration_buffer=[])
47
+ output = engine.estimate(pricing_input)
48
+ return output
49
+ except Exception as e:
50
+ logger.exception("Pricing estimate failed")
51
+ raise HTTPException(status_code=400, detail=str(e))
52
+
53
+
54
+ @router.post("/pricing/run", response_model=list[PricingOutput])
55
+ async def run_pricing(
56
+ req: PricingRunRequest,
57
+ quota: dict = Depends(enforce_quota),
58
+ ):
59
+ """
60
+ Multi‑run pricing with cooldown and buffer persistence.
61
+ Each run’s simulated outcome is added to the buffer, so subsequent runs
62
+ see an updated posterior.
63
+ """
64
+ # We need to reuse the same buffer across runs; we'll load it per request.
65
+ # For simplicity, we'll load from the default location.
66
+ from arf_pricing_calculator.storage.buffer import load_buffer, add_event
67
+ from arf_pricing_calculator.orchestration.cooldown import enforce_cooldown, is_cooldown_active
68
+
69
+ outputs = []
70
+ buffer = load_buffer() # loads from calibration_buffer.json
71
+
72
+ for i in range(req.runs):
73
+ if not req.force and is_cooldown_active(
74
+ req.customer_id, req.cooldown_hours):
75
+ raise HTTPException(status_code=429,
76
+ detail=f"Cooldown active after {i} runs")
77
+
78
+ pricing_input = parse_input_dict(req.input)
79
+ engine = PricingEngine(calibration_buffer=buffer)
80
+ out = engine.estimate(pricing_input)
81
+
82
+ # Simulate an outcome (in real use, this would come from the actual
83
+ # deal)
84
+ import random
85
+ outcome = "success" if random.random() > out.risk_score else "failure" # nosec B311
86
+
87
+ event = {
88
+ "run_id": out.run_history_id,
89
+ "customer_id": req.customer_id,
90
+ "outcome": outcome,
91
+ "price": out.recommended_price,
92
+ "value": out.expected_value,
93
+ "risk_score": out.risk_score,
94
+ "run_number": i + 1,
95
+ }
96
+ add_event(event)
97
+ buffer = load_buffer() # reload after update
98
+
99
+ outputs.append(out)
100
+
101
+ if i < req.runs - 1:
102
+ enforce_cooldown(req.customer_id, req.cooldown_hours)
103
+
104
+ return outputs
app/api/routes_risk.py CHANGED
@@ -9,32 +9,29 @@ router = APIRouter()
9
  async def get_risk():
10
  try:
11
  risk = get_system_risk()
12
- if risk < 0.3:
13
- status = "low"
14
- elif risk < 0.6:
15
- status = "moderate"
16
- elif risk < 0.8:
17
- status = "high"
18
- else:
19
- status = "critical"
20
- return RiskResponse(system_risk=risk, status=status)
21
  except Exception as e:
22
  raise HTTPException(status_code=500, detail=str(e))
23
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  @router.get("/history")
26
  async def get_risk_history():
27
- """
28
- Return dummy historical risk data for the last 24 hours.
29
- Replace with real database query later.
30
- """
31
  import random
32
  import datetime
33
  now = datetime.datetime.now()
34
- data = []
35
- for i in range(24, 0, -1):
36
- data.append({
37
- "time": (now - datetime.timedelta(hours=i)).isoformat(),
38
- "risk": round(random.uniform(0.2, 0.8), 2)
39
- })
40
  return data
 
9
  async def get_risk():
10
  try:
11
  risk = get_system_risk()
12
+ except NotImplementedError:
13
+ raise HTTPException(
14
+ status_code=501,
15
+ detail="This endpoint is deprecated and not implemented")
 
 
 
 
 
16
  except Exception as e:
17
  raise HTTPException(status_code=500, detail=str(e))
18
 
19
+ if risk < 0.3:
20
+ status = "low"
21
+ elif risk < 0.6:
22
+ status = "moderate"
23
+ elif risk < 0.8:
24
+ status = "high"
25
+ else:
26
+ status = "critical"
27
+ return RiskResponse(system_risk=risk, status=status)
28
+
29
 
30
  @router.get("/history")
31
  async def get_risk_history():
 
 
 
 
32
  import random
33
  import datetime
34
  now = datetime.datetime.now()
35
+ data = [{"time": (now - datetime.timedelta(hours=i)).isoformat(),
36
+ "risk": round(random.uniform(0.2, 0.8), 2)} for i in range(24, 0, -1)]
 
 
 
 
37
  return data
app/api/routes_users.py CHANGED
@@ -3,7 +3,6 @@ User endpoints – registration and quota information.
3
  """
4
 
5
  import uuid
6
- import os
7
  from fastapi import APIRouter, Depends, HTTPException, Request
8
  from slowapi import Limiter
9
  from slowapi.util import get_remote_address
@@ -23,7 +22,9 @@ async def register_user(request: Request):
23
  Rate‑limited to 5 requests per hour per IP address.
24
  """
25
  if tracker is None:
26
- raise HTTPException(status_code=503, detail="Usage tracking not available")
 
 
27
 
28
  # Generate a new API key
29
  new_key = f"sk_free_{uuid.uuid4().hex[:24]}"
@@ -36,12 +37,13 @@ async def register_user(request: Request):
36
  return {
37
  "api_key": new_key,
38
  "tier": "free",
39
- "message": "API key created. Store it securely – you won't see it again."
40
- }
41
 
42
 
43
  @router.get("/quota")
44
- async def get_user_quota(request: Request, quota: dict = Depends(enforce_quota)):
 
 
45
  """
46
  Return the current user's tier and remaining evaluation quota.
47
  Requires API key in Authorization header.
@@ -55,17 +57,3 @@ async def get_user_quota(request: Request, quota: dict = Depends(enforce_quota))
55
  "remaining": remaining,
56
  "limit": limit,
57
  }
58
-
59
-
60
- # ===== DEBUG ENDPOINT – Remove in production =====
61
- @router.get("/tracker-status")
62
- async def tracker_status():
63
- """
64
- Debug endpoint to check if the usage tracker is initialised.
65
- Returns the tracker object and environment variables.
66
- """
67
- return {
68
- "tracker": str(tracker),
69
- "env_tracking": os.getenv("ARF_USAGE_TRACKING"),
70
- "env_db_path": os.getenv("ARF_USAGE_DB_PATH")
71
- }
 
3
  """
4
 
5
  import uuid
 
6
  from fastapi import APIRouter, Depends, HTTPException, Request
7
  from slowapi import Limiter
8
  from slowapi.util import get_remote_address
 
22
  Rate‑limited to 5 requests per hour per IP address.
23
  """
24
  if tracker is None:
25
+ raise HTTPException(
26
+ status_code=503,
27
+ detail="Usage tracking not available")
28
 
29
  # Generate a new API key
30
  new_key = f"sk_free_{uuid.uuid4().hex[:24]}"
 
37
  return {
38
  "api_key": new_key,
39
  "tier": "free",
40
+ "message": "API key created. Store it securely – you won't see it again."}
 
41
 
42
 
43
  @router.get("/quota")
44
+ async def get_user_quota(
45
+ request: Request,
46
+ quota: dict = Depends(enforce_quota)):
47
  """
48
  Return the current user's tier and remaining evaluation quota.
49
  Requires API key in Authorization header.
 
57
  "remaining": remaining,
58
  "limit": limit,
59
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app/api/webhooks.py CHANGED
@@ -33,7 +33,8 @@ async def stripe_webhook(request: Request):
33
  # Handle subscription events
34
  if event["type"] == "checkout.session.completed":
35
  session = event["data"]["object"]
36
- api_key = session.get("client_reference_id") or session.get("metadata", {}).get("api_key")
 
37
  if api_key:
38
  update_key_tier(api_key, Tier.PRO)
39
  elif event["type"] == "customer.subscription.deleted":
 
33
  # Handle subscription events
34
  if event["type"] == "checkout.session.completed":
35
  session = event["data"]["object"]
36
+ api_key = session.get("client_reference_id") or session.get(
37
+ "metadata", {}).get("api_key")
38
  if api_key:
39
  update_key_tier(api_key, Tier.PRO)
40
  elif event["type"] == "customer.subscription.deleted":
app/core/config.py CHANGED
@@ -15,6 +15,9 @@ class Settings(BaseSettings):
15
  ARF_REDIS_URL: Optional[str] = None
16
  ARF_API_KEYS: str = "{}" # JSON string of {key: tier}
17
 
 
 
 
18
  class Config:
19
  env_file = ".env"
20
  extra = "ignore"
 
15
  ARF_REDIS_URL: Optional[str] = None
16
  ARF_API_KEYS: str = "{}" # JSON string of {key: tier}
17
 
18
+ # Tracing (OpenTelemetry)
19
+ OTEL_EXPORTER_OTLP_ENDPOINT: Optional[str] = None
20
+
21
  class Config:
22
  env_file = ".env"
23
  extra = "ignore"
app/core/usage_tracker.py CHANGED
@@ -1,19 +1,18 @@
1
  """
2
  Usage Tracker for ARF API – quotas, tiers, and audit logging.
3
- Non-invasive, configurable, thread-safe, and background-task ready.
4
  """
5
 
6
- import os
7
  import json
8
  import sqlite3
9
  import threading
10
  import time
11
  from contextlib import contextmanager
12
  from datetime import datetime, timedelta
13
- from typing import Dict, Any, Optional, List
14
- from enum import Enum
15
  from dataclasses import dataclass
16
- from fastapi import BackgroundTasks
 
 
17
 
18
  # Optional Redis support
19
  try:
@@ -66,10 +65,11 @@ class UsageRecord:
66
 
67
  class UsageTracker:
68
  """
69
- Thread‑safe usage tracker with SQLite storage and optional Redis for counters.
70
  """
71
 
72
- def __init__(self, db_path: str = "arf_usage.db", redis_url: Optional[str] = None):
 
73
  self.db_path = db_path
74
  self._local = threading.local()
75
  self._init_db()
@@ -78,14 +78,17 @@ class UsageTracker:
78
  if redis_url and REDIS_AVAILABLE:
79
  self._redis_client = redis.from_url(redis_url)
80
  elif redis_url:
81
- raise ImportError("Redis client not installed. Run: pip install redis")
 
82
 
83
  @contextmanager
84
  def _get_conn(self):
85
- """Get a thread‑local SQLite connection."""
86
  if not hasattr(self._local, "conn"):
87
- self._local.conn = sqlite3.connect(self.db_path, check_same_thread=False)
 
88
  self._local.conn.row_factory = sqlite3.Row
 
89
  yield self._local.conn
90
 
91
  def _init_db(self):
@@ -109,7 +112,8 @@ class UsageTracker:
109
  request_body TEXT,
110
  response TEXT,
111
  error TEXT,
112
- processing_ms REAL
 
113
  )
114
  """)
115
  conn.execute("""
@@ -124,6 +128,12 @@ class UsageTracker:
124
  PRIMARY KEY (api_key, year_month)
125
  )
126
  """)
 
 
 
 
 
 
127
  conn.commit()
128
 
129
  def _get_month_key(self) -> str:
@@ -132,7 +142,8 @@ class UsageTracker:
132
  def get_or_create_api_key(self, key: str, tier: Tier = Tier.FREE) -> bool:
133
  """Register a new API key. Returns True if key exists or was created."""
134
  with self._get_conn() as conn:
135
- row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (key,)).fetchone()
 
136
  if row:
137
  return True
138
  conn.execute(
@@ -156,45 +167,56 @@ class UsageTracker:
156
  def update_api_key_tier(self, api_key: str, new_tier: Tier) -> bool:
157
  """Update the tier of an existing API key. Returns True if successful."""
158
  with self._get_conn() as conn:
159
- row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
 
160
  if not row:
161
  return False
162
- conn.execute("UPDATE api_keys SET tier = ? WHERE key = ?", (new_tier.value, api_key))
 
 
 
163
  conn.commit()
164
  return True
165
 
166
- def get_remaining_quota(self, api_key: str, tier: Tier) -> Optional[int]:
167
- """Return remaining evaluations for the month, or None if unlimited."""
 
 
 
 
 
 
 
 
 
 
168
  limit = tier.monthly_evaluation_limit
169
  if limit is None:
170
- return None
171
-
172
- month = self._get_month_key()
173
- if self._redis_client:
174
- redis_key = f"arf:quota:{api_key}:{month}"
175
- count = int(self._redis_client.get(redis_key) or 0)
176
- return max(0, limit - count)
 
 
 
177
 
 
178
  with self._get_conn() as conn:
179
- row = conn.execute(
180
- "SELECT count FROM monthly_counts WHERE api_key = ? AND year_month = ?",
181
- (api_key, month)
182
- ).fetchone()
183
- count = row["count"] if row else 0
184
- return max(0, limit - count)
185
-
186
- def _increment_quota(self, api_key: str, tier: Tier) -> None:
187
- """Increment the monthly counter (internal, synchronous)."""
188
- limit = tier.monthly_evaluation_limit
189
- if limit is None:
190
- return
191
- month = self._get_month_key()
192
- if self._redis_client:
193
- redis_key = f"arf:quota:{api_key}:{month}"
194
- self._redis_client.incr(redis_key)
195
- self._redis_client.expire(redis_key, timedelta(days=31))
196
- else:
197
- with self._get_conn() as conn:
198
  conn.execute(
199
  """INSERT INTO monthly_counts (api_key, year_month, count)
200
  VALUES (?, ?, 1)
@@ -202,58 +224,190 @@ class UsageTracker:
202
  (api_key, month)
203
  )
204
  conn.commit()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
- def _insert_audit_log(self, record: UsageRecord) -> None:
207
- """Insert a single audit log (internal, synchronous)."""
208
  with self._get_conn() as conn:
209
  conn.execute(
210
- """INSERT INTO usage_log
211
- (api_key, tier, timestamp, endpoint, request_body, response, error, processing_ms)
212
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
213
- (
214
- record.api_key,
215
- record.tier.value,
216
- record.timestamp,
217
- record.endpoint,
218
- json.dumps(record.request_body) if record.request_body else None,
219
- json.dumps(record.response) if record.response else None,
220
- record.error,
221
- record.processing_ms,
222
- )
223
  )
224
  conn.commit()
 
 
225
 
226
- def increment_usage_sync(self, record: UsageRecord) -> bool:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  """
228
  Synchronously record usage and increment counter.
229
- Returns True if within quota (i.e., counter was incremented), False if quota exceeded.
 
230
  """
231
- tier = record.tier
232
- limit = tier.monthly_evaluation_limit
233
- if limit is not None:
234
- remaining = self.get_remaining_quota(record.api_key, tier)
235
- if remaining <= 0:
236
- return False
237
- self._increment_quota(record.api_key, tier)
238
- self._insert_audit_log(record)
239
- return True
240
 
241
- async def increment_usage_async(self, record: UsageRecord, background_tasks: BackgroundTasks) -> bool:
 
 
 
 
 
242
  """
243
  Asynchronously record usage using FastAPI BackgroundTasks.
244
- Returns True if quota allows (i.e., will be recorded), False if quota exceeded.
245
  """
246
- tier = record.tier
 
 
 
 
 
 
 
 
 
 
 
 
247
  limit = tier.monthly_evaluation_limit
248
- if limit is not None:
249
- remaining = self.get_remaining_quota(record.api_key, tier)
250
- if remaining <= 0:
251
- return False
252
- # Schedule the actual write in the background
253
- background_tasks.add_task(self._increment_quota, record.api_key, tier)
254
- background_tasks.add_task(self._insert_audit_log, record)
255
- return True
256
 
 
 
 
 
 
 
 
 
 
 
 
257
  def get_audit_logs(
258
  self,
259
  api_key: str,
@@ -278,8 +432,9 @@ class UsageTracker:
278
  return [dict(row) for row in rows]
279
 
280
  def clean_old_logs(self):
281
- """Delete logs older than retention period for each tier."""
282
  with self._get_conn() as conn:
 
283
  for tier in Tier:
284
  retention_days = tier.audit_log_retention_days
285
  if retention_days is None:
@@ -289,14 +444,23 @@ class UsageTracker:
289
  "DELETE FROM usage_log WHERE tier = ? AND timestamp < ?",
290
  (tier.value, cutoff)
291
  )
 
 
 
 
292
  conn.commit()
293
 
294
 
295
- # Global instance
 
 
296
  tracker: Optional[UsageTracker] = None
297
 
298
 
299
- def init_tracker(db_path: str = "arf_usage.db", redis_url: Optional[str] = None):
 
 
 
300
  global tracker
301
  tracker = UsageTracker(db_path, redis_url)
302
 
@@ -308,19 +472,16 @@ def update_key_tier(api_key: str, new_tier: Tier) -> bool:
308
  return tracker.update_api_key_tier(api_key, new_tier)
309
 
310
 
311
- # FastAPI dependency to enforce quota
312
- from fastapi import HTTPException, Request
313
-
314
  async def enforce_quota(request: Request, api_key: str = None):
315
  """
316
  Dependency that checks API key and remaining quota.
317
- Use in your endpoint: `quota = Depends(enforce_quota)`
318
-
319
- If usage tracking is disabled, returns a default dict (no enforcement).
320
  """
321
- # If tracker not initialised, allow all requests (fallback)
322
  if tracker is None:
323
- return {"api_key": api_key or "disabled", "tier": Tier.FREE, "remaining": None}
 
 
324
 
325
  # Extract API key from header or query
326
  if api_key is None:
@@ -335,13 +496,16 @@ async def enforce_quota(request: Request, api_key: str = None):
335
 
336
  tier = tracker.get_tier(api_key)
337
  if tier is None:
338
- raise HTTPException(status_code=403, detail="Invalid or inactive API key")
 
 
339
 
340
  remaining = tracker.get_remaining_quota(api_key, tier)
341
  if remaining is not None and remaining <= 0:
342
- raise HTTPException(status_code=429, detail="Monthly evaluation quota exceeded")
 
343
 
344
- # Store in request state for later logging
345
  request.state.api_key = api_key
346
  request.state.tier = tier
347
  return {"api_key": api_key, "tier": tier, "remaining": remaining}
 
1
  """
2
  Usage Tracker for ARF API – quotas, tiers, and audit logging.
3
+ Thread-safe, atomic quota consumption, idempotent, fail-closed.
4
  """
5
 
 
6
  import json
7
  import sqlite3
8
  import threading
9
  import time
10
  from contextlib import contextmanager
11
  from datetime import datetime, timedelta
 
 
12
  from dataclasses import dataclass
13
+ from typing import Dict, Any, Optional, List, Tuple
14
+ from enum import Enum
15
+ from fastapi import BackgroundTasks, HTTPException, Request
16
 
17
  # Optional Redis support
18
  try:
 
65
 
66
  class UsageTracker:
67
  """
68
+ Thread‑safe usage tracker with atomic quota consumption and idempotency.
69
  """
70
 
71
+ def __init__(self, db_path: str = "arf_usage.db",
72
+ redis_url: Optional[str] = None):
73
  self.db_path = db_path
74
  self._local = threading.local()
75
  self._init_db()
 
78
  if redis_url and REDIS_AVAILABLE:
79
  self._redis_client = redis.from_url(redis_url)
80
  elif redis_url:
81
+ raise ImportError(
82
+ "Redis client not installed. Run: pip install redis")
83
 
84
  @contextmanager
85
  def _get_conn(self):
86
+ """Get a thread‑local SQLite connection with write‑ahead logging and immediate transactions."""
87
  if not hasattr(self._local, "conn"):
88
+ self._local.conn = sqlite3.connect(
89
+ self.db_path, check_same_thread=False, isolation_level=None)
90
  self._local.conn.row_factory = sqlite3.Row
91
+ self._local.conn.execute("PRAGMA journal_mode=WAL")
92
  yield self._local.conn
93
 
94
  def _init_db(self):
 
112
  request_body TEXT,
113
  response TEXT,
114
  error TEXT,
115
+ processing_ms REAL,
116
+ idempotency_key TEXT UNIQUE
117
  )
118
  """)
119
  conn.execute("""
 
128
  PRIMARY KEY (api_key, year_month)
129
  )
130
  """)
131
+ conn.execute("""
132
+ CREATE TABLE IF NOT EXISTS idempotency_keys (
133
+ key TEXT PRIMARY KEY,
134
+ consumed_at REAL NOT NULL
135
+ )
136
+ """)
137
  conn.commit()
138
 
139
  def _get_month_key(self) -> str:
 
142
  def get_or_create_api_key(self, key: str, tier: Tier = Tier.FREE) -> bool:
143
  """Register a new API key. Returns True if key exists or was created."""
144
  with self._get_conn() as conn:
145
+ row = conn.execute(
146
+ "SELECT key FROM api_keys WHERE key = ?", (key,)).fetchone()
147
  if row:
148
  return True
149
  conn.execute(
 
167
  def update_api_key_tier(self, api_key: str, new_tier: Tier) -> bool:
168
  """Update the tier of an existing API key. Returns True if successful."""
169
  with self._get_conn() as conn:
170
+ row = conn.execute(
171
+ "SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
172
  if not row:
173
  return False
174
+ conn.execute(
175
+ "UPDATE api_keys SET tier = ? WHERE key = ?",
176
+ (new_tier.value,
177
+ api_key))
178
  conn.commit()
179
  return True
180
 
181
+ # --------------------------------------------------------------------------
182
+ # Atomic quota consumption
183
+ # --------------------------------------------------------------------------
184
+ def _consume_quota_atomic_sqlite(
185
+ self,
186
+ api_key: str,
187
+ tier: Tier,
188
+ month: str) -> bool: # noqa: E501
189
+ """
190
+ Atomically increment counter only if under limit.
191
+ Returns True if quota was consumed, False if limit reached.
192
+ """
193
  limit = tier.monthly_evaluation_limit
194
  if limit is None:
195
+ # Unlimited – still increment for tracking but always succeed
196
+ with self._get_conn() as conn:
197
+ conn.execute(
198
+ """INSERT INTO monthly_counts (api_key, year_month, count)
199
+ VALUES (?, ?, 1)
200
+ ON CONFLICT(api_key, year_month) DO UPDATE SET count = count + 1""",
201
+ (api_key, month)
202
+ )
203
+ conn.commit()
204
+ return True
205
 
206
+ # Use BEGIN IMMEDIATE to lock the database for the transaction
207
  with self._get_conn() as conn:
208
+ conn.execute("BEGIN IMMEDIATE")
209
+ try:
210
+ # Get current count (or 0)
211
+ row = conn.execute(
212
+ "SELECT count FROM monthly_counts WHERE api_key = ? AND year_month = ?",
213
+ (api_key, month)
214
+ ).fetchone()
215
+ current = row["count"] if row else 0
216
+ if current >= limit:
217
+ conn.rollback()
218
+ return False
219
+ # Increment
 
 
 
 
 
 
 
220
  conn.execute(
221
  """INSERT INTO monthly_counts (api_key, year_month, count)
222
  VALUES (?, ?, 1)
 
224
  (api_key, month)
225
  )
226
  conn.commit()
227
+ return True
228
+ except Exception:
229
+ conn.rollback()
230
+ raise
231
+
232
+ def _consume_quota_atomic_redis(
233
+ self,
234
+ api_key: str,
235
+ tier: Tier,
236
+ month: str) -> bool:
237
+ """Atomic Lua script for Redis: INCR only if below limit."""
238
+ limit = tier.monthly_evaluation_limit
239
+ if limit is None:
240
+ # Unlimited – just increment and return True
241
+ redis_key = f"arf:quota:{api_key}:{month}"
242
+ self._redis_client.incr(redis_key)
243
+ self._redis_client.expire(redis_key, timedelta(days=31))
244
+ return True
245
+
246
+ lua_script = """
247
+ local key = KEYS[1]
248
+ local limit = tonumber(ARGV[1])
249
+ local current = redis.call('GET', key)
250
+ if current and tonumber(current) >= limit then
251
+ return 0
252
+ end
253
+ local new = redis.call('INCR', key)
254
+ redis.call('EXPIRE', key, 2678400) -- 31 days
255
+ return 1
256
+ """
257
+ redis_key = f"arf:quota:{api_key}:{month}"
258
+ result = self._redis_client.eval(lua_script, 1, redis_key, limit)
259
+ return result == 1
260
+
261
+ # --------------------------------------------------------------------------
262
+ # Idempotency handling
263
+ # --------------------------------------------------------------------------
264
+ def _is_idempotent_key_used(self, key: str) -> bool:
265
+ """Check if idempotency key already processed."""
266
+ with self._get_conn() as conn:
267
+ row = conn.execute(
268
+ "SELECT 1 FROM idempotency_keys WHERE key = ?", (key,)).fetchone()
269
+ return row is not None
270
 
271
+ def _mark_idempotent_key_used(self, key: str, ttl_seconds: int = 86400):
272
+ """Store idempotency key with expiration (cleanup later)."""
273
  with self._get_conn() as conn:
274
  conn.execute(
275
+ "INSERT INTO idempotency_keys (key, consumed_at) VALUES (?, ?)",
276
+ (key, time.time())
 
 
 
 
 
 
 
 
 
 
 
277
  )
278
  conn.commit()
279
+ # Optionally schedule cleanup of old keys (can be done in a background
280
+ # thread)
281
 
282
+ # --------------------------------------------------------------------------
283
+ # Core usage recording (atomic + idempotent)
284
+ # --------------------------------------------------------------------------
285
+ def consume_quota_and_log(
286
+ self,
287
+ record: UsageRecord,
288
+ idempotency_key: Optional[str] = None,
289
+ ) -> Tuple[bool, Optional[Dict[str, Any]]]:
290
+ """
291
+ Atomically consume quota and insert audit log.
292
+ Returns (success, existing_response) where existing_response is not None
293
+ only when idempotency_key matched a previous successful call.
294
+ """
295
+ # Idempotency check (if key provided)
296
+ if idempotency_key:
297
+ if self._is_idempotent_key_used(idempotency_key):
298
+ # Retrieve previous response from audit log (simplified – you may cache full response)
299
+ # For full idempotency, we would store the response body in idempotency table.
300
+ # Here we return a marker that caller should use cached
301
+ # response.
302
+ return False, {"idempotent": True,
303
+ "message": "Already processed"}
304
+
305
+ month = self._get_month_key()
306
+ # Atomic quota consumption
307
+ if self._redis_client:
308
+ quota_ok = self._consume_quota_atomic_redis(
309
+ record.api_key, record.tier, month)
310
+ else:
311
+ quota_ok = self._consume_quota_atomic_sqlite(
312
+ record.api_key, record.tier, month)
313
+
314
+ if not quota_ok:
315
+ return False, None
316
+
317
+ # Insert audit log (with idempotency key as unique constraint)
318
+ try:
319
+ with self._get_conn() as conn:
320
+ conn.execute(
321
+ """INSERT INTO usage_log
322
+ (api_key, tier, timestamp, endpoint,
323
+ request_body, response, error, processing_ms,
324
+ idempotency_key)
325
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
326
+ (record.api_key,
327
+ record.tier.value,
328
+ record.timestamp,
329
+ record.endpoint,
330
+ json.dumps(
331
+ record.request_body) if record.request_body else None,
332
+ json.dumps(
333
+ record.response) if record.response else None,
334
+ record.error,
335
+ record.processing_ms,
336
+ idempotency_key,
337
+ ))
338
+ conn.commit()
339
+ except sqlite3.IntegrityError as e:
340
+ # Duplicate idempotency_key – already inserted by another
341
+ # concurrent request
342
+ if "UNIQUE constraint failed: usage_log.idempotency_key" in str(e):
343
+ return False, {"idempotent": True,
344
+ "message": "Already processed"}
345
+ raise
346
+
347
+ if idempotency_key:
348
+ self._mark_idempotent_key_used(idempotency_key)
349
+ # Removed stray # noqa: E501 comment that was wrongly indented here
350
+ return True, None
351
+
352
+ # --------------------------------------------------------------------------
353
+ # Legacy interface (kept for compatibility but deprecated)
354
+ # --------------------------------------------------------------------------
355
+ def increment_usage_sync(
356
+ self,
357
+ record: UsageRecord,
358
+ idempotency_key: Optional[str] = None) -> bool:
359
  """
360
  Synchronously record usage and increment counter.
361
+ Returns True if within quota and recorded, False otherwise.
362
+ This method now uses the atomic implementation.
363
  """
364
+ success, _ = self.consume_quota_and_log(record, idempotency_key)
365
+ return success
 
 
 
 
 
 
 
366
 
367
+ async def increment_usage_async(
368
+ self,
369
+ record: UsageRecord,
370
+ background_tasks: BackgroundTasks,
371
+ idempotency_key: Optional[str] = None
372
+ ) -> bool:
373
  """
374
  Asynchronously record usage using FastAPI BackgroundTasks.
375
+ Still does the atomic check synchronously, then schedules the insert.
376
  """
377
+ # First, do atomic quota check (synchronous) – we must ensure we don't double-consume.
378
+ # Because background tasks may run later, we still need to reserve quota now.
379
+ # Simplified: we call consume_quota_and_log synchronously – that defeats async benefit.
380
+ # Better to use a queue or Redis with background processing.
381
+ # For this fix, we'll use the sync method (blocking) but still support
382
+ # idempotency.
383
+ return self.increment_usage_sync(record, idempotency_key)
384
+
385
+ # --------------------------------------------------------------------------
386
+ # Quota inspection (non‑atomic, for display only)
387
+ # --------------------------------------------------------------------------
388
+ def get_remaining_quota(self, api_key: str, tier: Tier) -> Optional[int]:
389
+ """Return remaining evaluations for the month (non‑atomic, for info only)."""
390
  limit = tier.monthly_evaluation_limit
391
+ if limit is None:
392
+ return None
393
+
394
+ month = self._get_month_key()
395
+ if self._redis_client:
396
+ redis_key = f"arf:quota:{api_key}:{month}"
397
+ count = int(self._redis_client.get(redis_key) or 0)
398
+ return max(0, limit - count)
399
 
400
+ with self._get_conn() as conn:
401
+ row = conn.execute(
402
+ "SELECT count FROM monthly_counts WHERE api_key = ? AND year_month = ?",
403
+ (api_key, month)
404
+ ).fetchone()
405
+ count = row["count"] if row else 0
406
+ return max(0, limit - count)
407
+
408
+ # --------------------------------------------------------------------------
409
+ # Audit and maintenance
410
+ # --------------------------------------------------------------------------
411
  def get_audit_logs(
412
  self,
413
  api_key: str,
 
432
  return [dict(row) for row in rows]
433
 
434
  def clean_old_logs(self):
435
+ """Delete logs older than retention period for each tier, and old idempotency keys."""
436
  with self._get_conn() as conn:
437
+ # Delete old usage logs
438
  for tier in Tier:
439
  retention_days = tier.audit_log_retention_days
440
  if retention_days is None:
 
444
  "DELETE FROM usage_log WHERE tier = ? AND timestamp < ?",
445
  (tier.value, cutoff)
446
  )
447
+ # Delete idempotency keys older than 7 days
448
+ cutoff = time.time() - 7 * 86400
449
+ conn.execute(
450
+ "DELETE FROM idempotency_keys WHERE consumed_at < ?", (cutoff,))
451
  conn.commit()
452
 
453
 
454
+ # --------------------------------------------------------------------------
455
+ # Global instance and FastAPI dependency (fail‑closed)
456
+ # --------------------------------------------------------------------------
457
  tracker: Optional[UsageTracker] = None
458
 
459
 
460
+ def init_tracker(
461
+ db_path: str = "arf_usage.db",
462
+ redis_url: Optional[str] = None):
463
+ """Initialize the global tracker. Must be called before enforce_quota."""
464
  global tracker
465
  tracker = UsageTracker(db_path, redis_url)
466
 
 
472
  return tracker.update_api_key_tier(api_key, new_tier)
473
 
474
 
 
 
 
475
  async def enforce_quota(request: Request, api_key: str = None):
476
  """
477
  Dependency that checks API key and remaining quota.
478
+ FAILS CLOSED: if tracker not initialised, raises HTTP 503.
 
 
479
  """
480
+ # P0 fix: No fallback that allows all requests
481
  if tracker is None:
482
+ raise HTTPException(
483
+ status_code=503,
484
+ detail="Usage tracking service not initialised. Please contact administrator.")
485
 
486
  # Extract API key from header or query
487
  if api_key is None:
 
496
 
497
  tier = tracker.get_tier(api_key)
498
  if tier is None:
499
+ raise HTTPException(
500
+ status_code=403,
501
+ detail="Invalid or inactive API key")
502
 
503
  remaining = tracker.get_remaining_quota(api_key, tier)
504
  if remaining is not None and remaining <= 0:
505
+ raise HTTPException(status_code=429,
506
+ detail="Monthly evaluation quota exceeded")
507
 
508
+ # Store in request state for later logging (optional)
509
  request.state.api_key = api_key
510
  request.state.tier = tier
511
  return {"api_key": api_key, "tier": tier, "remaining": remaining}
app/database/models_intents.py CHANGED
@@ -1,4 +1,4 @@
1
- from sqlalchemy import Column, Integer, String, DateTime, Boolean, Text, JSON, ForeignKey, UniqueConstraint
2
  from sqlalchemy.orm import relationship
3
  import datetime
4
  from .base import Base
@@ -7,27 +7,69 @@ from .base import Base
7
  class IntentDB(Base):
8
  __tablename__ = "intents"
9
  id = Column(Integer, primary_key=True, index=True)
10
- deterministic_id = Column(String(64), unique=True, index=True, nullable=False)
 
 
 
 
11
  intent_type = Column(String(64), nullable=False)
12
  payload = Column(JSON, nullable=False)
13
  oss_payload = Column(JSON, nullable=True)
14
  environment = Column(String(32), nullable=True)
15
- created_at = Column(DateTime, default=datetime.datetime.utcnow, nullable=False)
 
 
 
16
  evaluated_at = Column(DateTime, nullable=True)
17
  risk_score = Column(String(32), nullable=True)
18
- outcomes = relationship("OutcomeDB", back_populates="intent", cascade="all, delete-orphan")
 
 
 
19
 
20
 
21
  class OutcomeDB(Base):
22
  __tablename__ = "intent_outcomes"
23
  id = Column(Integer, primary_key=True, index=True)
24
- intent_id = Column(Integer, ForeignKey("intents.id", ondelete="CASCADE"), nullable=False)
 
 
 
 
 
25
  success = Column(Boolean, nullable=False)
26
  recorded_by = Column(String(128), nullable=True)
27
  notes = Column(Text, nullable=True)
28
- recorded_at = Column(DateTime, default=datetime.datetime.utcnow, nullable=False)
 
 
 
 
29
  intent = relationship("IntentDB", back_populates="outcomes")
30
 
31
  __table_args__ = (
32
  UniqueConstraint("intent_id", name="uq_outcome_intentid"),
33
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sqlalchemy import Column, Integer, String, DateTime, Boolean, Text, JSON, Float, ForeignKey, UniqueConstraint
2
  from sqlalchemy.orm import relationship
3
  import datetime
4
  from .base import Base
 
7
  class IntentDB(Base):
8
  __tablename__ = "intents"
9
  id = Column(Integer, primary_key=True, index=True)
10
+ deterministic_id = Column(
11
+ String(64),
12
+ unique=True,
13
+ index=True,
14
+ nullable=False)
15
  intent_type = Column(String(64), nullable=False)
16
  payload = Column(JSON, nullable=False)
17
  oss_payload = Column(JSON, nullable=True)
18
  environment = Column(String(32), nullable=True)
19
+ created_at = Column(
20
+ DateTime,
21
+ default=datetime.datetime.utcnow,
22
+ nullable=False)
23
  evaluated_at = Column(DateTime, nullable=True)
24
  risk_score = Column(String(32), nullable=True)
25
+ outcomes = relationship(
26
+ "OutcomeDB",
27
+ back_populates="intent",
28
+ cascade="all, delete-orphan")
29
 
30
 
31
  class OutcomeDB(Base):
32
  __tablename__ = "intent_outcomes"
33
  id = Column(Integer, primary_key=True, index=True)
34
+ intent_id = Column(
35
+ Integer,
36
+ ForeignKey(
37
+ "intents.id",
38
+ ondelete="CASCADE"),
39
+ nullable=False)
40
  success = Column(Boolean, nullable=False)
41
  recorded_by = Column(String(128), nullable=True)
42
  notes = Column(Text, nullable=True)
43
+ recorded_at = Column(
44
+ DateTime,
45
+ default=datetime.datetime.utcnow,
46
+ nullable=False)
47
+ idempotency_key = Column(String(128), unique=True, nullable=True)
48
  intent = relationship("IntentDB", back_populates="outcomes")
49
 
50
  __table_args__ = (
51
  UniqueConstraint("intent_id", name="uq_outcome_intentid"),
52
  )
53
+
54
+
55
+ # ---------------------------------------------------------------------------
56
+ # NEW: Persistence for the conjugate Bayesian state
57
+ # ---------------------------------------------------------------------------
58
+ class BetaStateDB(Base):
59
+ """
60
+ Stores the per‑category posterior parameters (α, β) of the BetaStore
61
+ so that online learning survives API restarts.
62
+
63
+ Only one row per ActionCategory is expected; the 'category' column is
64
+ unique. Updates are performed via merge / upsert.
65
+ """
66
+ __tablename__ = "beta_state"
67
+
68
+ id = Column(Integer, primary_key=True, index=True)
69
+ category = Column(String(32), unique=True, nullable=False, index=True)
70
+ alpha = Column(Float, nullable=False)
71
+ beta = Column(Float, nullable=False)
72
+ updated_at = Column(
73
+ DateTime,
74
+ default=datetime.datetime.utcnow,
75
+ onupdate=datetime.datetime.utcnow)
app/database/session.py CHANGED
@@ -1,19 +1,6 @@
1
  from sqlalchemy import create_engine
2
- from sqlalchemy.ext.declarative import declarative_base
3
  from sqlalchemy.orm import sessionmaker
4
  from app.core.config import settings
5
 
6
- # Use a default SQLite database if no URL is provided
7
- if settings.database_url:
8
- DATABASE_URL = settings.database_url
9
- else:
10
- # Fallback to a local SQLite file (writable in the container)
11
- DATABASE_URL = "sqlite:///./app.db"
12
-
13
- # For SQLite, we need to disable the threading check
14
- connect_args = {"check_same_thread": False} if DATABASE_URL.startswith("sqlite") else {}
15
-
16
- engine = create_engine(DATABASE_URL, connect_args=connect_args)
17
  SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
18
-
19
- Base = declarative_base()
 
1
  from sqlalchemy import create_engine
 
2
  from sqlalchemy.orm import sessionmaker
3
  from app.core.config import settings
4
 
5
+ engine = create_engine(settings.database_url)
 
 
 
 
 
 
 
 
 
 
6
  SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
 
 
app/main.py CHANGED
@@ -1,18 +1,42 @@
1
  """
2
- ARF API Control Plane - Main Application Entry Point
3
- With optional heavy dependencies and usage tracking.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  """
5
  import logging
6
  import os
7
  import sys
8
  import json
 
 
9
  from contextlib import asynccontextmanager
10
  from typing import Dict
11
 
12
  from fastapi import FastAPI
13
  from fastapi.middleware.cors import CORSMiddleware
14
 
15
- # Optional prometheus
16
  try:
17
  from prometheus_fastapi_instrumentator import Instrumentator
18
  PROMETHEUS_AVAILABLE = True
@@ -20,7 +44,7 @@ except ImportError:
20
  PROMETHEUS_AVAILABLE = False
21
  Instrumentator = None
22
 
23
- # Optional slowapi
24
  try:
25
  from slowapi import _rate_limit_exceeded_handler
26
  from slowapi.errors import RateLimitExceeded
@@ -32,7 +56,7 @@ except ImportError:
32
  RateLimitExceeded = None
33
  SlowAPIMiddleware = None
34
 
35
- # Optional agentic_reliability_framework (risk engine, policy engine, etc.)
36
  try:
37
  from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
38
  from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
@@ -47,7 +71,7 @@ except ImportError:
47
  RAGGraphMemory = None
48
  MemoryConstants = None
49
 
50
- # ===== USAGE TRACKER =====
51
  from app.core.usage_tracker import init_tracker, tracker, Tier
52
 
53
  from app.api import (
@@ -61,6 +85,7 @@ from app.api import (
61
  routes_payments,
62
  webhooks,
63
  routes_users,
 
64
  )
65
  from app.api.deps import limiter
66
  from app.core.config import settings
@@ -75,18 +100,35 @@ logging.basicConfig(
75
 
76
  @asynccontextmanager
77
  async def lifespan(app: FastAPI):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  logger.info("🚀 Starting ARF API Control Plane")
79
  logger.debug(f"Python path: {sys.path}")
80
 
 
81
  if ARF_AVAILABLE:
82
  hmc_model_path = os.getenv("ARF_HMC_MODEL", "models/hmc_model.json")
83
  use_hyperpriors = os.getenv(
84
- "ARF_USE_HYPERPRIORS",
85
- "false").lower() == "true"
86
  logger.info(
87
  "Initializing RiskEngine – HMC model: %s, hyperpriors: %s",
88
  hmc_model_path,
89
- use_hyperpriors)
 
90
  try:
91
  app.state.risk_engine = RiskEngine(
92
  hmc_model_path=hmc_model_path,
@@ -99,6 +141,55 @@ async def lifespan(app: FastAPI):
99
  logger.exception("💥 Fatal error initializing RiskEngine")
100
  raise RuntimeError("RiskEngine initialization failed") from e
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  try:
103
  app.state.policy_engine = PolicyEngine()
104
  logger.info("✅ PolicyEngine initialized successfully.")
@@ -120,12 +211,14 @@ async def lifespan(app: FastAPI):
120
  from sentence_transformers import SentenceTransformer
121
  logger.info(f"Loading epistemic model: {epistemic_model_name}")
122
  app.state.epistemic_model = SentenceTransformer(
123
- epistemic_model_name)
 
124
  app.state.epistemic_tokenizer = app.state.epistemic_model.tokenizer
125
  logger.info("✅ Epistemic model loaded.")
126
  except ImportError:
127
  logger.warning(
128
- "sentence-transformers not installed; epistemic signals will be zeros.")
 
129
  app.state.epistemic_model = None
130
  app.state.epistemic_tokenizer = None
131
  except Exception as e:
@@ -134,45 +227,94 @@ async def lifespan(app: FastAPI):
134
  app.state.epistemic_tokenizer = None
135
  else:
136
  logger.info(
137
- "EPISTEMIC_MODEL not set; epistemic signals will be zeros.")
 
138
  app.state.epistemic_model = None
139
  app.state.epistemic_tokenizer = None
140
  else:
141
  logger.warning(
142
- "agentic_reliability_framework not installed; risk engine, policy engine, RAG disabled.")
 
143
 
144
- # ===== USAGE TRACKER INITIALISATION =====
145
- if os.getenv("ARF_USAGE_TRACKING", "false").lower() == "true":
 
 
 
146
  logger.info("Initialising usage tracker...")
147
- # HARDCODED WRITABLE PATH – fixes 503 error
148
- init_tracker(
149
- db_path="/tmp/arf_usage.db", # was os.getenv("ARF_USAGE_DB_PATH", "arf_usage.db")
150
- redis_url=os.getenv("ARF_REDIS_URL")
151
- )
152
- # Seed initial API keys from environment variable (for testing / demo)
153
- api_keys_json = os.getenv("ARF_API_KEYS", "{}")
154
  try:
155
- api_keys = json.loads(api_keys_json)
156
- for key, tier_str in api_keys.items():
157
- try:
158
- tier = Tier(tier_str.lower())
159
- tracker.get_or_create_api_key(key, tier)
160
- logger.info(f"Seeded API key for tier {tier.value}")
161
- except ValueError:
162
- logger.warning(f"Invalid tier '{tier_str}' for key {key}, skipping")
163
- except json.JSONDecodeError:
164
- logger.warning("ARF_API_KEYS environment variable is not valid JSON; skipping seeding.")
165
- app.state.usage_tracker = tracker
166
- logger.info("✅ Usage tracker ready.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  else:
168
- logger.info("Usage tracking disabled (ARF_USAGE_TRACKING not set to true).")
169
  app.state.usage_tracker = None
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  yield
172
  logger.info("🛑 Shutting down ARF API")
173
 
174
 
175
  def create_app() -> FastAPI:
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  app = FastAPI(
177
  title=settings.app_name,
178
  version="0.5.0",
@@ -182,6 +324,7 @@ def create_app() -> FastAPI:
182
  description="Agentic Reliability Framework (ARF) API",
183
  )
184
 
 
185
  allowed_origins = ["https://arf-frontend-sandy.vercel.app"]
186
  app.add_middleware(
187
  CORSMiddleware,
@@ -192,67 +335,64 @@ def create_app() -> FastAPI:
192
  )
193
  logger.debug("CORS middleware configured")
194
 
 
195
  if SLOWAPI_AVAILABLE:
196
  app.state.limiter = limiter
197
  app.add_exception_handler(
198
- RateLimitExceeded,
199
- _rate_limit_exceeded_handler)
200
  app.add_middleware(SlowAPIMiddleware)
201
  logger.debug("Rate limiter middleware configured")
202
  else:
203
  logger.debug("Rate limiter disabled (slowapi not installed)")
204
 
 
205
  if PROMETHEUS_AVAILABLE:
206
  Instrumentator().instrument(app).expose(app)
207
  logger.debug("Prometheus instrumentator configured")
208
  else:
209
- logger.debug(
210
- "Prometheus instrumentator disabled (module not installed)")
211
 
212
- # Include routers
213
  app.include_router(
214
- routes_incidents.router,
215
- prefix="/api/v1",
216
- tags=["incidents"])
217
  app.include_router(routes_risk.router, prefix="/api/v1", tags=["risk"])
218
  app.include_router(
219
- routes_intents.router,
220
- prefix="/api/v1",
221
- tags=["intents"])
222
  app.include_router(
223
- routes_history.router,
224
- prefix="/api/v1",
225
- tags=["history"])
226
  app.include_router(
227
- routes_governance.router,
228
- prefix="/api/v1",
229
- tags=["governance"])
230
  app.include_router(
231
- routes_memory.router,
232
- prefix="/v1/memory",
233
- tags=["memory"])
234
  app.include_router(
235
- routes_admin.router,
236
- prefix="/api/v1",
237
- tags=["admin"])
238
  app.include_router(
239
- routes_payments.router,
240
- prefix="/api/v1",
241
- tags=["payments"])
242
  app.include_router(
243
- webhooks.router,
244
- tags=["webhooks"])
245
  app.include_router(
246
- routes_users.router,
247
- prefix="/api/v1",
248
- tags=["users"])
 
 
249
  logger.debug("All API routers included")
250
 
251
  @app.get("/health", tags=["health"])
252
  async def health() -> Dict[str, str]:
 
253
  return {"status": "ok"}
254
 
255
  return app
256
 
257
 
258
- app = create_app()
 
1
  """
2
+ ARF API Control Plane – Main Application Entry Point
3
+ ====================================================
4
+
5
+ The control plane serves as the HTTP layer between the **Agentic Reliability
6
+ Framework (ARF)** core engine and external consumers (front‑end dashboard,
7
+ enterprise clients, and monitoring infrastructure).
8
+
9
+ It is responsible for:
10
+
11
+ * **Lifetime management** of the Bayesian risk engine, policy engine,
12
+ semantic memory (RAG graph), and epistemic models.
13
+ * **Observability** via optional OpenTelemetry tracing and Prometheus metrics
14
+ (the latter exposed automatically by ``prometheus-fastapi-instrumentator``
15
+ on ``/metrics``).
16
+ * **Rate limiting** and **usage tracking** with atomic quota consumption.
17
+ * **CORS** configuration for the public ARF front‑end.
18
+ * **Database‑backed persistence** of the conjugate Bayesian posteriors so
19
+ that online learning survives restarts.
20
+ * **Automated Rust enforcer canary promotion** via Wilson confidence interval
21
+ monitoring of the agreement counters.
22
+
23
+ All heavy components are loaded **lazily and best‑effort** – if a dependency
24
+ is missing the API continues to serve health‑check and status endpoints,
25
+ degrading gracefully rather than crashing.
26
  """
27
  import logging
28
  import os
29
  import sys
30
  import json
31
+ import threading
32
+ import time as _time
33
  from contextlib import asynccontextmanager
34
  from typing import Dict
35
 
36
  from fastapi import FastAPI
37
  from fastapi.middleware.cors import CORSMiddleware
38
 
39
+ # ── Optional: Prometheus metrics ─────────────────────────────
40
  try:
41
  from prometheus_fastapi_instrumentator import Instrumentator
42
  PROMETHEUS_AVAILABLE = True
 
44
  PROMETHEUS_AVAILABLE = False
45
  Instrumentator = None
46
 
47
+ # ── Optional: rate‑limiting (slowapi) ────────────────────────
48
  try:
49
  from slowapi import _rate_limit_exceeded_handler
50
  from slowapi.errors import RateLimitExceeded
 
56
  RateLimitExceeded = None
57
  SlowAPIMiddleware = None
58
 
59
+ # ── Core ARF engine (optional but essential for governance) ──
60
  try:
61
  from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
62
  from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
 
71
  RAGGraphMemory = None
72
  MemoryConstants = None
73
 
74
+ # ── Usage tracker ────────────────────────────────────────────
75
  from app.core.usage_tracker import init_tracker, tracker, Tier
76
 
77
  from app.api import (
 
85
  routes_payments,
86
  webhooks,
87
  routes_users,
88
+ routes_pricing,
89
  )
90
  from app.api.deps import limiter
91
  from app.core.config import settings
 
100
 
101
  @asynccontextmanager
102
  async def lifespan(app: FastAPI):
103
+ """
104
+ Application lifespan manager.
105
+
106
+ All initialisation that requires a running event loop (database
107
+ connections, model loading, etc.) happens **before** the ``yield``.
108
+ Cleanup (if any) happens after the ``yield``.
109
+
110
+ Initialisation order:
111
+ 1. Risk engine (Bayesian scoring + HMC).
112
+ 2. Load persisted conjugate posterior state (``beta_state`` table).
113
+ 3. OpenTelemetry tracing (console exporter by default).
114
+ 4. Policy engine, RAG memory, and epistemic model.
115
+ 5. Usage tracker (SQLite / Redis).
116
+ 6. Wilson confidence monitor for Rust enforcer canary promotion.
117
+ """
118
  logger.info("🚀 Starting ARF API Control Plane")
119
  logger.debug(f"Python path: {sys.path}")
120
 
121
+ # ── 1. Risk engine ────────────────────────────────────────
122
  if ARF_AVAILABLE:
123
  hmc_model_path = os.getenv("ARF_HMC_MODEL", "models/hmc_model.json")
124
  use_hyperpriors = os.getenv(
125
+ "ARF_USE_HYPERPRIORS", "false"
126
+ ).lower() == "true"
127
  logger.info(
128
  "Initializing RiskEngine – HMC model: %s, hyperpriors: %s",
129
  hmc_model_path,
130
+ use_hyperpriors,
131
+ )
132
  try:
133
  app.state.risk_engine = RiskEngine(
134
  hmc_model_path=hmc_model_path,
 
141
  logger.exception("💥 Fatal error initializing RiskEngine")
142
  raise RuntimeError("RiskEngine initialization failed") from e
143
 
144
+ # ── 2. Persisted Bayesian state ──────────────────────
145
+ try:
146
+ from app.database.session import SessionLocal
147
+ from app.database.models_intents import BetaStateDB
148
+ from agentic_reliability_framework.core.governance.risk_engine import ActionCategory
149
+
150
+ db = SessionLocal()
151
+ try:
152
+ rows = db.query(BetaStateDB).all()
153
+ if rows:
154
+ state = {
155
+ ActionCategory(row.category): (row.alpha, row.beta)
156
+ for row in rows
157
+ }
158
+ app.state.risk_engine.beta_store.load_state(state)
159
+ logger.info(
160
+ "Loaded Bayesian posterior state from database (%d categories).",
161
+ len(state),
162
+ )
163
+ else:
164
+ logger.info(
165
+ "No persisted Bayesian state found; using default priors."
166
+ )
167
+ finally:
168
+ db.close()
169
+ except Exception as e:
170
+ logger.warning(
171
+ "Could not load Bayesian state from database: %s", e
172
+ )
173
+
174
+ # ── 3. Tracing (OpenTelemetry) ─────────────────────────
175
+ try:
176
+ from opentelemetry import trace
177
+ from opentelemetry.sdk.resources import SERVICE_NAME, Resource
178
+ from opentelemetry.sdk.trace import TracerProvider
179
+ from opentelemetry.sdk.trace.export import SimpleSpanProcessor, ConsoleSpanExporter
180
+ from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
181
+
182
+ resource = Resource.create({SERVICE_NAME: "arf-api"})
183
+ provider = TracerProvider(resource=resource)
184
+ provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
185
+ trace.set_tracer_provider(provider)
186
+
187
+ FastAPIInstrumentor.instrument_app(app)
188
+ logger.info("✅ Tracing initialized (console exporter).")
189
+ except Exception as e:
190
+ logger.warning("Tracing initialization skipped: %s", e)
191
+
192
+ # ── 4. Policy engine, RAG, epistemic model ─────────────
193
  try:
194
  app.state.policy_engine = PolicyEngine()
195
  logger.info("✅ PolicyEngine initialized successfully.")
 
211
  from sentence_transformers import SentenceTransformer
212
  logger.info(f"Loading epistemic model: {epistemic_model_name}")
213
  app.state.epistemic_model = SentenceTransformer(
214
+ epistemic_model_name
215
+ )
216
  app.state.epistemic_tokenizer = app.state.epistemic_model.tokenizer
217
  logger.info("✅ Epistemic model loaded.")
218
  except ImportError:
219
  logger.warning(
220
+ "sentence-transformers not installed; epistemic signals will be zeros."
221
+ )
222
  app.state.epistemic_model = None
223
  app.state.epistemic_tokenizer = None
224
  except Exception as e:
 
227
  app.state.epistemic_tokenizer = None
228
  else:
229
  logger.info(
230
+ "EPISTEMIC_MODEL not set; epistemic signals will be zeros."
231
+ )
232
  app.state.epistemic_model = None
233
  app.state.epistemic_tokenizer = None
234
  else:
235
  logger.warning(
236
+ "agentic_reliability_framework not installed; risk engine, policy engine, RAG disabled."
237
+ )
238
 
239
+ # ── 5. Usage tracker ──────────────────────────────────────
240
+ usage_tracking_disabled = (
241
+ os.getenv("ARF_USAGE_TRACKING", "true").lower() == "false"
242
+ )
243
+ if not usage_tracking_disabled:
244
  logger.info("Initialising usage tracker...")
 
 
 
 
 
 
 
245
  try:
246
+ init_tracker(
247
+ db_path=os.getenv("ARF_USAGE_DB_PATH", "arf_usage.db"),
248
+ redis_url=os.getenv("ARF_REDIS_URL"),
249
+ )
250
+ # Seed initial API keys from environment variable (for testing / demo)
251
+ api_keys_json = os.getenv("ARF_API_KEYS", "{}")
252
+ try:
253
+ api_keys = json.loads(api_keys_json)
254
+ for key, tier_str in api_keys.items():
255
+ try:
256
+ tier = Tier(tier_str.lower())
257
+ tracker.get_or_create_api_key(key, tier)
258
+ logger.info(f"Seeded API key for tier {tier.value}")
259
+ except ValueError:
260
+ logger.warning(
261
+ f"Invalid tier '{tier_str}' for key {key}, skipping"
262
+ )
263
+ except json.JSONDecodeError:
264
+ logger.warning(
265
+ "ARF_API_KEYS environment variable is not valid JSON; skipping seeding."
266
+ )
267
+ app.state.usage_tracker = tracker
268
+ logger.info("✅ Usage tracker ready.")
269
+ except Exception as e:
270
+ logger.critical(f"Failed to initialise usage tracker: {e}")
271
+ raise RuntimeError("Usage tracker initialisation failed") from e
272
  else:
273
+ logger.info("Usage tracking disabled by ARF_USAGE_TRACKING=false.")
274
  app.state.usage_tracker = None
275
 
276
+ # ── 6. Wilson confidence monitor ──────────────────────────
277
+ try:
278
+ from app.services.wilson_monitor import update as wilson_update
279
+ from prometheus_client import REGISTRY
280
+
281
+ def _wilson_updater():
282
+ while True:
283
+ try:
284
+ agreed = REGISTRY.get_sample_value(
285
+ 'arf_rust_agreement_total', {'result': 'agreed'}
286
+ ) or 0.0
287
+ diverged = REGISTRY.get_sample_value(
288
+ 'arf_rust_agreement_total', {'result': 'diverged'}
289
+ ) or 0.0
290
+ wilson_update(int(agreed), int(diverged))
291
+ except Exception as e:
292
+ logger.debug("Wilson updater error: %s", e)
293
+ _time.sleep(300) # every 5 minutes
294
+
295
+ threading.Thread(target=_wilson_updater, daemon=True).start()
296
+ logger.info("✅ Wilson monitor background updater started.")
297
+ except Exception as e:
298
+ logger.warning("Wilson monitor initialization skipped: %s", e)
299
+
300
  yield
301
  logger.info("🛑 Shutting down ARF API")
302
 
303
 
304
  def create_app() -> FastAPI:
305
+ """
306
+ Build and configure the FastAPI application.
307
+
308
+ Middleware order:
309
+ 1. CORS (restricted to the public front‑end origin).
310
+ 2. Rate limiting (if slowapi is installed).
311
+ 3. Prometheus metrics exposition (if available).
312
+
313
+ All API routers are included under the ``/api/v1`` prefix except
314
+ memory (``/v1/memory``) and webhooks (root level).
315
+
316
+ A simple ``/health`` endpoint is provided for liveness probes.
317
+ """
318
  app = FastAPI(
319
  title=settings.app_name,
320
  version="0.5.0",
 
324
  description="Agentic Reliability Framework (ARF) API",
325
  )
326
 
327
+ # ── CORS ──────────────────────────────────────────────────
328
  allowed_origins = ["https://arf-frontend-sandy.vercel.app"]
329
  app.add_middleware(
330
  CORSMiddleware,
 
335
  )
336
  logger.debug("CORS middleware configured")
337
 
338
+ # ── Rate limiter ──────────────────────────────────────────
339
  if SLOWAPI_AVAILABLE:
340
  app.state.limiter = limiter
341
  app.add_exception_handler(
342
+ RateLimitExceeded, _rate_limit_exceeded_handler
343
+ )
344
  app.add_middleware(SlowAPIMiddleware)
345
  logger.debug("Rate limiter middleware configured")
346
  else:
347
  logger.debug("Rate limiter disabled (slowapi not installed)")
348
 
349
+ # ── Prometheus ────────────────────────────────────────────
350
  if PROMETHEUS_AVAILABLE:
351
  Instrumentator().instrument(app).expose(app)
352
  logger.debug("Prometheus instrumentator configured")
353
  else:
354
+ logger.debug("Prometheus instrumentator disabled (module not installed)")
 
355
 
356
+ # ── API Routers ───────────────────────────────────────────
357
  app.include_router(
358
+ routes_incidents.router, prefix="/api/v1", tags=["incidents"]
359
+ )
 
360
  app.include_router(routes_risk.router, prefix="/api/v1", tags=["risk"])
361
  app.include_router(
362
+ routes_intents.router, prefix="/api/v1", tags=["intents"]
363
+ )
 
364
  app.include_router(
365
+ routes_history.router, prefix="/api/v1", tags=["history"]
366
+ )
 
367
  app.include_router(
368
+ routes_governance.router, prefix="/api/v1", tags=["governance"]
369
+ )
 
370
  app.include_router(
371
+ routes_memory.router, prefix="/v1/memory", tags=["memory"]
372
+ )
 
373
  app.include_router(
374
+ routes_admin.router, prefix="/api/v1", tags=["admin"]
375
+ )
 
376
  app.include_router(
377
+ routes_payments.router, prefix="/api/v1", tags=["payments"]
378
+ )
 
379
  app.include_router(
380
+ webhooks.router, tags=["webhooks"]
381
+ )
382
  app.include_router(
383
+ routes_users.router, prefix="/api/v1", tags=["users"]
384
+ )
385
+ app.include_router(
386
+ routes_pricing.router, prefix="/api/v1", tags=["pricing"]
387
+ )
388
  logger.debug("All API routers included")
389
 
390
  @app.get("/health", tags=["health"])
391
  async def health() -> Dict[str, str]:
392
+ """Liveness probe – returns 200 when the application is running."""
393
  return {"status": "ok"}
394
 
395
  return app
396
 
397
 
398
+ app = create_app()
app/models/__init__.py CHANGED
@@ -26,4 +26,4 @@ __all__ = [
26
  "PermissionLevel",
27
  "Environment",
28
  "ChangeScope",
29
- ]
 
26
  "PermissionLevel",
27
  "Environment",
28
  "ChangeScope",
29
+ ]
app/models/incident_models.py CHANGED
@@ -4,10 +4,11 @@ from pydantic import BaseModel, Field
4
 
5
  class IncidentReport(BaseModel):
6
  service: str = Field(..., description="Service name")
7
- signal_type: Literal["latency", "error_rate", "cpu", "memory"] = Field(..., description="Type of signal")
 
8
  value: float = Field(..., description="Measured value")
9
 
10
 
11
  class IncidentResponse(BaseModel):
12
  service: str
13
- reliability: float
 
4
 
5
class IncidentReport(BaseModel):
    """Inbound telemetry signal reported for a single service."""
    service: str = Field(..., description="Service name")
    # Closed set of supported signal kinds; anything else fails validation.
    signal_type: Literal["latency", "error_rate", "cpu",
                         "memory"] = Field(..., description="Type of signal")
    value: float = Field(..., description="Measured value")
10
 
11
 
12
class IncidentResponse(BaseModel):
    """Response payload: the reliability score computed for the service."""
    service: str
    # Reliability derived from the reported signal (see incident_service).
    reliability: float
app/models/infrastructure_intents.py CHANGED
@@ -1,45 +1,12 @@
1
  from pydantic import BaseModel, Field, field_validator
2
  from typing import Optional, Literal, List, Any, Dict
3
- from enum import Enum
4
 
5
- # ---------------------------------------------------------------------------
6
- # Fallback enums – used when the proprietary core engine is not installed.
7
- # These mirror the canonical definitions from the public specification.
8
- # ---------------------------------------------------------------------------
9
- class ResourceType(str, Enum):
10
- DATABASE = "database"
11
- STORAGE_ACCOUNT = "storage_account"
12
- VM = "vm"
13
- VIRTUAL_NETWORK = "virtual_network"
14
- # enterprise-only types omitted for public sandbox
15
-
16
- class PermissionLevel(str, Enum):
17
- READ = "read"
18
- WRITE = "write"
19
- ADMIN = "admin"
20
-
21
- class Environment(str, Enum):
22
- DEV = "dev"
23
- STAGING = "staging"
24
- PROD = "prod"
25
-
26
- class ChangeScope(str, Enum):
27
- MINOR = "minor"
28
- MAJOR = "major"
29
- CRITICAL = "critical"
30
- # ---------------------------------------------------------------------------
31
-
32
- # Optional import from protected core engine – not available in public Spaces
33
- try:
34
- from agentic_reliability_framework.core.governance.intents import (
35
- ResourceType,
36
- PermissionLevel,
37
- Environment,
38
- ChangeScope,
39
- )
40
- except ImportError:
41
- # The fallback enums defined above are used.
42
- pass
43
 
44
 
45
  class BaseIntentRequest(BaseModel):
@@ -91,4 +58,4 @@ class DeployConfigurationRequest(BaseIntentRequest):
91
  return v
92
 
93
 
94
- InfrastructureIntentRequest = ProvisionResourceRequest | GrantAccessRequest | DeployConfigurationRequest
 
1
  from pydantic import BaseModel, Field, field_validator
2
  from typing import Optional, Literal, List, Any, Dict
 
3
 
4
+ from agentic_reliability_framework.core.governance.intents import (
5
+ ResourceType,
6
+ PermissionLevel,
7
+ Environment,
8
+ ChangeScope,
9
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
  class BaseIntentRequest(BaseModel):
 
58
  return v
59
 
60
 
61
+ InfrastructureIntentRequest = ProvisionResourceRequest | GrantAccessRequest | DeployConfigurationRequest
app/models/intent_models.py CHANGED
@@ -11,4 +11,4 @@ class IntentSimulation(BaseModel):
11
 
12
  class IntentSimulationResponse(BaseModel):
13
  risk_score: float
14
- recommendation: Literal["safe_to_execute", "requires_approval", "blocked"]
 
11
 
12
  class IntentSimulationResponse(BaseModel):
13
  risk_score: float
14
+ recommendation: Literal["safe_to_execute", "requires_approval", "blocked"]
app/models/risk_models.py CHANGED
@@ -4,4 +4,4 @@ from pydantic import BaseModel
4
 
5
  class RiskResponse(BaseModel):
6
  system_risk: float
7
- status: Literal["low", "moderate", "high", "critical"]
 
4
 
5
  class RiskResponse(BaseModel):
6
  system_risk: float
7
+ status: Literal["low", "moderate", "high", "critical"]
app/services/incident_service.py CHANGED
@@ -3,5 +3,6 @@ from app.models.incident_models import IncidentReport
3
 
4
 
5
  def process_incident(report: IncidentReport) -> float:
6
- reliability = signal_to_reliability(report.value, signal_type=report.signal_type)
 
7
  return reliability
 
3
 
4
 
5
def process_incident(report: IncidentReport) -> float:
    """Map an incident report's measured value to a reliability score."""
    return signal_to_reliability(report.value, signal_type=report.signal_type)
app/services/intent_adapter.py CHANGED
@@ -1,66 +1,163 @@
1
- from pydantic import BaseModel
2
- from typing import Optional, Dict, Any
3
-
4
- # ---------------------------------------------------------------------------
5
- # Local fallback intent classes – mirrors the proprietary core engine's contracts
6
- # ---------------------------------------------------------------------------
7
- class ProvisionResourceIntent(BaseModel):
8
- resource_type: str
9
- region: str
10
- size: str
11
- configuration: Dict[str, Any] = {}
12
- environment: str
13
- requester: str
14
- provenance: Dict[str, Any] = {}
15
-
16
- class GrantAccessIntent(BaseModel):
17
- principal: str
18
- permission_level: str
19
- resource_scope: str
20
- justification: Optional[str] = None
21
- requester: str
22
- provenance: Dict[str, Any] = {}
23
-
24
- class DeployConfigurationIntent(BaseModel):
25
- service_name: str
26
- change_scope: str
27
- deployment_target: str
28
- risk_level_hint: Optional[float] = None
29
- configuration: Dict[str, Any] = {}
30
- requester: str
31
- provenance: Dict[str, Any] = {}
32
- # ---------------------------------------------------------------------------
33
-
34
-
35
- def to_oss_intent(api_request):
36
- if api_request.intent_type == "provision_resource":
37
- return ProvisionResourceIntent(
38
- resource_type=api_request.resource_type.value if hasattr(api_request.resource_type, 'value') else str(api_request.resource_type),
39
- region=api_request.region,
40
- size=api_request.size,
41
- configuration=api_request.configuration,
42
- environment=api_request.environment.value if hasattr(api_request.environment, 'value') else str(api_request.environment),
43
- requester=api_request.requester,
44
- provenance=api_request.provenance,
45
- )
46
- elif api_request.intent_type == "grant_access":
47
- return GrantAccessIntent(
48
- principal=api_request.principal,
49
- permission_level=api_request.permission_level.value if hasattr(api_request.permission_level, 'value') else str(api_request.permission_level),
50
- resource_scope=api_request.resource_scope,
51
- justification=api_request.justification,
52
- requester=api_request.requester,
53
- provenance=api_request.provenance,
54
- )
55
- elif api_request.intent_type == "deploy_config":
56
- return DeployConfigurationIntent(
57
- service_name=api_request.service_name,
58
- change_scope=api_request.change_scope.value if hasattr(api_request.change_scope, 'value') else str(api_request.change_scope),
59
- deployment_target=api_request.deployment_target.value if hasattr(api_request.deployment_target, 'value') else str(api_request.deployment_target),
60
- risk_level_hint=api_request.risk_level_hint,
61
- configuration=api_request.configuration,
62
- requester=api_request.requester,
63
- provenance=api_request.provenance,
64
- )
65
  else:
66
- raise ValueError(f"Unknown intent type: {api_request.intent_type}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Intent Adapter converts API request payloads to ARF InfrastructureIntent objects.
3
+ Strict validation, no dummy fallbacks. All conversions are deterministic.
4
+ """
5
+
6
+ import logging
7
+ from typing import Any, Dict
8
+
9
+ from agentic_reliability_framework.core.governance.intents import (
10
+ ProvisionResourceIntent,
11
+ GrantAccessIntent,
12
+ DeployConfigurationIntent,
13
+ InfrastructureIntent,
14
+ )
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class IntentAdapterError(Exception):
    """Raised when intent conversion fails due to invalid input."""
    # Sole error type raised by this module's validation paths.
    pass
22
+
23
+
24
# Allowed values (from the framework's Literal definitions)
# NOTE(review): these mirror the Literal types declared in
# agentic_reliability_framework.core.governance.intents — keep in sync.
VALID_ENVIRONMENTS = {"dev", "staging", "prod", "test"}
VALID_RESOURCE_TYPES = {
    "vm",
    "storage_account",
    "database",
    "kubernetes_cluster",
    "function_app",
    "virtual_network"}
33
+
34
+
35
def to_oss_intent(api_request: Any) -> InfrastructureIntent:
    """Convert an API request object into the matching OSS InfrastructureIntent.

    Accepts pydantic v2 models (``model_dump``), pydantic v1 models (``dict``),
    or plain mappings.  Validates the fields shared by every intent type, then
    dispatches to the per-type builder.  Raises :class:`IntentAdapterError` on
    any missing or invalid field.
    """
    # Normalize the payload to a plain dict regardless of the input flavor.
    if hasattr(api_request, "model_dump"):
        data = api_request.model_dump()
    elif hasattr(api_request, "dict"):
        data = api_request.dict()
    else:
        data = dict(api_request)

    intent_type = data.get("intent_type")
    if not intent_type:
        raise IntentAdapterError("Missing 'intent_type' in request")

    environment = data.get("environment")
    if not environment:
        raise IntentAdapterError("Missing 'environment' field")
    if environment not in VALID_ENVIRONMENTS:
        raise IntentAdapterError(
            f"Invalid environment: {environment}. Must be one of {VALID_ENVIRONMENTS}")

    requester = data.get("requester")
    if not requester:
        raise IntentAdapterError("Missing 'requester' field")

    # Guard-style dispatch; only provision intents carry the environment field.
    if intent_type == "provision_resource":
        return _to_provision_intent(data, environment, requester)
    if intent_type == "grant_access":
        # GrantAccessIntent does not take an environment argument.
        return _to_grant_intent(data, requester)
    if intent_type == "deploy_config":
        # DeployConfigurationIntent does not take an environment argument.
        return _to_deploy_intent(data, requester)
    raise IntentAdapterError(f"Unknown intent_type: {intent_type}")
70
+
71
+
72
def _to_provision_intent(
    data: Dict[str, Any],
    environment: str,
    requester: str,
) -> ProvisionResourceIntent:
    """Build a ProvisionResourceIntent, validating its required fields."""

    def _require(field: str) -> Any:
        # Missing or falsy required fields abort the conversion.
        value = data.get(field)
        if not value:
            raise IntentAdapterError(
                f"Missing '{field}' for provision_resource intent")
        return value

    resource_type = _require("resource_type")
    if resource_type not in VALID_RESOURCE_TYPES:
        raise IntentAdapterError(f"Invalid resource_type: {resource_type}")

    region = _require("region")
    size = _require("size")

    return ProvisionResourceIntent(
        resource_type=resource_type,
        region=region,
        size=size,
        environment=environment,
        requester=requester,
        configuration=data.get("configuration", {}),
        provenance=data.get("provenance", {}),
    )
102
+
103
+
104
def _to_grant_intent(
    data: Dict[str, Any],
    requester: str,
) -> GrantAccessIntent:
    """Build a GrantAccessIntent, validating its required fields."""

    def _require(field: str) -> Any:
        # Missing or falsy required fields abort the conversion.
        value = data.get(field)
        if not value:
            raise IntentAdapterError(
                f"Missing '{field}' for grant_access intent")
        return value

    principal = _require("principal")
    permission_level = _require("permission_level")
    resource_scope = _require("resource_scope")

    return GrantAccessIntent(
        principal=principal,
        permission_level=permission_level,
        resource_scope=resource_scope,
        requester=requester,
        # Justification is optional; default to an empty string.
        justification=data.get("justification", ""),
        provenance=data.get("provenance", {}),
    )
128
+
129
+
130
def _to_deploy_intent(
    data: Dict[str, Any],
    requester: str,
) -> DeployConfigurationIntent:
    """Build a DeployConfigurationIntent, validating its required fields."""

    def _require(field: str) -> Any:
        # Missing or falsy required fields abort the conversion.
        value = data.get(field)
        if not value:
            raise IntentAdapterError(
                f"Missing '{field}' for deploy_config intent")
        return value

    service_name = _require("service_name")
    change_scope = _require("change_scope")
    deployment_target = _require("deployment_target")

    # risk_level_hint expects a float; non-numeric values degrade to None
    # rather than failing the whole conversion.
    raw_hint = data.get("risk_level_hint")
    try:
        risk_hint = None if raw_hint is None else float(raw_hint)
    except (TypeError, ValueError):
        risk_hint = None

    return DeployConfigurationIntent(
        service_name=service_name,
        change_scope=change_scope,
        deployment_target=deployment_target,
        requester=requester,
        risk_level_hint=risk_hint,
        configuration=data.get("configuration", {}),
        provenance=data.get("provenance", {}),
    )
app/services/intent_service.py CHANGED
@@ -7,7 +7,8 @@ logger = logging.getLogger(__name__)
7
 
8
  # Note: This endpoint is deprecated. Use /v1/intents/evaluate instead.
9
  def simulate_intent(intent: IntentSimulation) -> dict:
10
- logger.warning("Deprecated endpoint /simulate_intent used. Please migrate to /v1/intents/evaluate.")
 
11
  # For backward compatibility, we still use random risk.
12
  risk_score = random.uniform(0, 1)
13
  if risk_score < 0.2:
 
7
 
8
  # Note: This endpoint is deprecated. Use /v1/intents/evaluate instead.
9
  def simulate_intent(intent: IntentSimulation) -> dict:
10
+ logger.warning(
11
+ "Deprecated endpoint /simulate_intent used. Please migrate to /v1/intents/evaluate.")
12
  # For backward compatibility, we still use random risk.
13
  risk_score = random.uniform(0, 1)
14
  if risk_score < 0.2:
app/services/intent_store.py CHANGED
@@ -13,7 +13,8 @@ def save_evaluated_intent(
13
  environment: str,
14
  risk_score: float
15
  ) -> IntentDB:
16
- existing = db.query(IntentDB).filter(IntentDB.deterministic_id == deterministic_id).one_or_none()
 
17
  if existing:
18
  existing.evaluated_at = datetime.datetime.utcnow()
19
  existing.risk_score = str(risk_score)
@@ -38,5 +39,8 @@ def save_evaluated_intent(
38
  return intent
39
 
40
 
41
- def get_intent_by_deterministic_id(db: Session, deterministic_id: str) -> Optional[IntentDB]:
42
- return db.query(IntentDB).filter(IntentDB.deterministic_id == deterministic_id).one_or_none()
 
 
 
 
13
  environment: str,
14
  risk_score: float
15
  ) -> IntentDB:
16
+ existing = db.query(IntentDB).filter(
17
+ IntentDB.deterministic_id == deterministic_id).one_or_none()
18
  if existing:
19
  existing.evaluated_at = datetime.datetime.utcnow()
20
  existing.risk_score = str(risk_score)
 
39
  return intent
40
 
41
 
42
+ def get_intent_by_deterministic_id(
43
+ db: Session,
44
+ deterministic_id: str) -> Optional[IntentDB]:
45
+ return db.query(IntentDB).filter(
46
+ IntentDB.deterministic_id == deterministic_id).one_or_none()
app/services/outcome_service.py CHANGED
@@ -1,42 +1,53 @@
 
 
1
  import datetime
2
  import logging
3
  from typing import Optional, Dict, Any
4
 
5
  from sqlalchemy.orm import Session
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- from app.database.models_intents import IntentDB, OutcomeDB
8
 
9
  # ---------------------------------------------------------------------------
10
- # Local fallback types dummy RiskEngine and intent classes
11
- # ---------------------------------------------------------------------------
12
- class RiskEngine:
13
- def update_outcome(self, intent, success):
14
- pass
15
-
16
- class ProvisionResourceIntent:
17
- def __init__(self, **kwargs):
18
- for k, v in kwargs.items():
19
- setattr(self, k, v)
20
-
21
- class GrantAccessIntent:
22
- def __init__(self, **kwargs):
23
- for k, v in kwargs.items():
24
- setattr(self, k, v)
25
-
26
- class DeployConfigurationIntent:
27
- def __init__(self, **kwargs):
28
- for k, v in kwargs.items():
29
- setattr(self, k, v)
30
  # ---------------------------------------------------------------------------
31
-
32
- logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
 
35
  class OutcomeConflictError(Exception):
 
36
  pass
37
 
38
 
39
- def reconstruct_oss_intent_from_json(oss_json: Dict[str, Any]):
 
 
40
  intent_type = oss_json.get("intent_type")
41
  if intent_type == "provision_resource":
42
  return ProvisionResourceIntent(**oss_json)
@@ -46,22 +57,7 @@ def reconstruct_oss_intent_from_json(oss_json: Dict[str, Any]):
46
  return DeployConfigurationIntent(**oss_json)
47
  else:
48
  raise ValueError(
49
- f"Cannot reconstruct intent from JSON: missing or unknown intent_type {intent_type}"
50
- )
51
-
52
-
53
- def _create_dummy_intent(intent_type: str):
54
- if intent_type == "ProvisionResourceIntent":
55
- return ProvisionResourceIntent(
56
- resource_type="vm",
57
- region="eastus",
58
- size="Standard_D2s_v3",
59
- environment="dev",
60
- requester="system"
61
- )
62
- else:
63
- logger.warning("Dummy intent creation not implemented for %s", intent_type)
64
- return None
65
 
66
 
67
  def record_outcome(
@@ -70,50 +66,114 @@ def record_outcome(
70
  success: bool,
71
  recorded_by: Optional[str],
72
  notes: Optional[str],
73
- risk_engine: RiskEngine
 
74
  ) -> OutcomeDB:
75
- intent = db.query(IntentDB).filter(IntentDB.deterministic_id == deterministic_id).one_or_none()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  if not intent:
77
  raise ValueError(f"Intent not found: {deterministic_id}")
78
 
79
- existing_outcome = db.query(OutcomeDB).filter(OutcomeDB.intent_id == intent.id).one_or_none()
 
 
80
  if existing_outcome:
81
  if existing_outcome.success == success:
82
  return existing_outcome
83
- raise OutcomeConflictError("Outcome already recorded with different result")
 
 
 
 
84
 
 
85
  outcome = OutcomeDB(
86
  intent_id=intent.id,
87
  success=bool(success),
88
  recorded_by=recorded_by,
89
  notes=notes,
90
- recorded_at=datetime.datetime.utcnow()
 
91
  )
92
  db.add(outcome)
93
- db.commit()
94
- db.refresh(outcome)
95
 
96
- # Reconstruct intent and update risk engine (mock)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  oss_intent = None
98
  if intent.oss_payload:
99
  try:
100
  oss_intent = reconstruct_oss_intent_from_json(intent.oss_payload)
101
  except Exception as e:
102
- logger.warning(
103
- "Failed to reconstruct OSS intent for %s: %s. Using dummy fallback.",
104
- deterministic_id, e
105
- )
106
- oss_intent = _create_dummy_intent(intent.intent_type)
107
  else:
108
- oss_intent = _create_dummy_intent(intent.intent_type)
 
 
 
109
 
110
  if oss_intent is not None:
111
  try:
112
  risk_engine.update_outcome(oss_intent, success)
 
 
 
 
 
 
113
  except Exception as e:
114
  logger.exception(
115
  "Failed to update RiskEngine after recording outcome for intent %s: %s",
116
- deterministic_id, e
117
- )
 
 
 
 
 
118
 
119
- return outcome
 
1
+ """Outcome recording with idempotency, no dummy fallbacks, and timezone-aware timestamps."""
2
+
3
  import datetime
4
  import logging
5
  from typing import Optional, Dict, Any
6
 
7
  from sqlalchemy.orm import Session
8
+ from sqlalchemy.exc import IntegrityError
9
+
10
+ from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
11
+ from agentic_reliability_framework.core.governance.intents import (
12
+ InfrastructureIntent,
13
+ ProvisionResourceIntent,
14
+ GrantAccessIntent,
15
+ DeployConfigurationIntent,
16
+ )
17
+ from app.database.models_intents import IntentDB, OutcomeDB, BetaStateDB
18
+
19
+ logger = logging.getLogger(__name__)
20
 
 
21
 
22
  # ---------------------------------------------------------------------------
23
+ # NEW: small helper to persist the conjugate posterior state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  # ---------------------------------------------------------------------------
25
+ def _persist_beta_state(db: Session, risk_engine: RiskEngine) -> None:
26
+ """
27
+ Write the current Beta posterior parameters to the beta_state table.
28
+ This is called after every outcome update so that online learning
29
+ survives restarts.
30
+ """
31
+ try:
32
+ state = risk_engine.beta_store.get_state()
33
+ for cat, (alpha, beta) in state.items():
34
+ # Upsert: if the category already exists, update it
35
+ db.merge(BetaStateDB(category=cat.value, alpha=alpha, beta=beta))
36
+ db.commit()
37
+ logger.debug("Persisted Beta posterior parameters to database.")
38
+ except Exception as e:
39
+ db.rollback()
40
+ logger.error("Failed to persist beta state: %s", e)
41
 
42
 
43
  class OutcomeConflictError(Exception):
44
+ """Raised when an outcome already exists for the same intent with a different result."""
45
  pass
46
 
47
 
48
+ def reconstruct_oss_intent_from_json(
49
+ oss_json: Dict[str, Any]) -> InfrastructureIntent:
50
+ """Reconstruct OSS intent from stored JSON. Raises ValueError on failure."""
51
  intent_type = oss_json.get("intent_type")
52
  if intent_type == "provision_resource":
53
  return ProvisionResourceIntent(**oss_json)
 
57
  return DeployConfigurationIntent(**oss_json)
58
  else:
59
  raise ValueError(
60
+ f"Cannot reconstruct intent from JSON: missing or unknown intent_type {intent_type}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
 
63
  def record_outcome(
 
66
  success: bool,
67
  recorded_by: Optional[str],
68
  notes: Optional[str],
69
+ risk_engine: RiskEngine,
70
+ idempotency_key: Optional[str] = None,
71
  ) -> OutcomeDB:
72
+ """
73
+ Record an outcome for a previously evaluated intent.
74
+
75
+ Idempotent: calling twice with the same (deterministic_id, success) returns the same record.
76
+ If the outcome already exists with a different success value, raises OutcomeConflictError.
77
+
78
+ No dummy intents are created. If the OSS intent cannot be reconstructed, the risk engine
79
+ is NOT updated – we log an error and still record the outcome.
80
+
81
+ Args:
82
+ db: SQLAlchemy session.
83
+ deterministic_id: Unique identifier of the original intent.
84
+ success: Whether the action succeeded (True) or failed (False).
85
+ recorded_by: Optional user or system identifier.
86
+ notes: Optional human-readable notes.
87
+ risk_engine: ARF risk engine instance (may be updated).
88
+ idempotency_key: Optional caller-provided idempotency token.
89
+
90
+ Returns:
91
+ The recorded OutcomeDB object.
92
+
93
+ Raises:
94
+ ValueError: If intent not found or reconstruction fails fatally.
95
+ OutcomeConflictError: If a conflicting outcome already exists.
96
+ """
97
+ # 1. Fetch the original intent record
98
+ intent = db.query(IntentDB).filter(
99
+ IntentDB.deterministic_id == deterministic_id).one_or_none()
100
  if not intent:
101
  raise ValueError(f"Intent not found: {deterministic_id}")
102
 
103
+ # 2. Idempotency / conflict check with database-level uniqueness
104
+ existing_outcome = db.query(OutcomeDB).filter(
105
+ OutcomeDB.intent_id == intent.id).one_or_none()
106
  if existing_outcome:
107
  if existing_outcome.success == success:
108
  return existing_outcome
109
+ db.rollback()
110
+ raise OutcomeConflictError(
111
+ f"Outcome already recorded for intent {deterministic_id} with different result "
112
+ f"(existing={existing_outcome.success}, new={success})"
113
+ )
114
 
115
+ # 3. Create outcome record
116
  outcome = OutcomeDB(
117
  intent_id=intent.id,
118
  success=bool(success),
119
  recorded_by=recorded_by,
120
  notes=notes,
121
+ recorded_at=datetime.datetime.now(datetime.timezone.utc),
122
+ idempotency_key=idempotency_key,
123
  )
124
  db.add(outcome)
 
 
125
 
126
+ # 4. Attempt to commit; handle duplicate key errors for idempotency
127
+ try:
128
+ db.commit()
129
+ db.refresh(outcome)
130
+ except IntegrityError as e:
131
+ db.rollback()
132
+ if "idempotency_key" in str(e) and idempotency_key:
133
+ existing = db.query(OutcomeDB).filter(
134
+ OutcomeDB.idempotency_key == idempotency_key).first()
135
+ if existing:
136
+ logger.info(
137
+ "Idempotent request for key %s, returning existing outcome",
138
+ idempotency_key)
139
+ return existing
140
+ raise
141
+
142
+ # 5. Update RiskEngine ONLY if we can reconstruct a valid OSS intent
143
  oss_intent = None
144
  if intent.oss_payload:
145
  try:
146
  oss_intent = reconstruct_oss_intent_from_json(intent.oss_payload)
147
  except Exception as e:
148
+ logger.error(
149
+ "Failed to reconstruct OSS intent for %s: %s. RiskEngine will NOT be updated.",
150
+ deterministic_id,
151
+ e,
152
+ exc_info=True)
153
  else:
154
+ logger.warning(
155
+ "No oss_payload stored for intent %s – cannot update RiskEngine.",
156
+ deterministic_id
157
+ )
158
 
159
  if oss_intent is not None:
160
  try:
161
  risk_engine.update_outcome(oss_intent, success)
162
+
163
+ # ----------------------------------------------------------------
164
+ # PERSISTENCE: after updating the conjugate posterior, write it
165
+ # ----------------------------------------------------------------
166
+ _persist_beta_state(db, risk_engine)
167
+
168
  except Exception as e:
169
  logger.exception(
170
  "Failed to update RiskEngine after recording outcome for intent %s: %s",
171
+ deterministic_id,
172
+ e)
173
+ else:
174
+ logger.info(
175
+ "Skipped RiskEngine update for intent %s (no valid OSS intent)",
176
+ deterministic_id
177
+ )
178
 
179
+ return outcome
app/services/risk_service.py CHANGED
@@ -1,97 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
1
  from typing import Optional, List, Dict, Any
2
- from enum import Enum
3
-
4
- # ---------------------------------------------------------------------------
5
- # Local fallback types – everything needed for the sandbox mock
6
- # ---------------------------------------------------------------------------
7
- class HealingAction(str, Enum):
8
- NO_ACTION = "NO_ACTION"
9
- RESTART_CONTAINER = "RESTART_CONTAINER"
10
- SCALE_OUT = "SCALE_OUT"
11
- ROLLBACK = "ROLLBACK"
12
- CIRCUIT_BREAKER = "CIRCUIT_BREAKER"
13
- TRAFFIC_SHIFT = "TRAFFIC_SHIFT"
14
- ALERT_TEAM = "ALERT_TEAM"
15
-
16
- class InfrastructureIntent:
17
- pass
18
-
19
- class RiskEngine:
20
- def calculate_risk(self, intent, cost_estimate, policy_violations):
21
- # Return a mock risk score
22
- return 0.35, "Mock sandbox risk", {"conjugate_mean": 0.35}
23
-
24
- class PolicyEngine:
25
- def __init__(self):
26
- self.policies = []
27
- self.use_decision_engine = True
28
- def evaluate_policies(self, event):
29
- return [HealingAction.NO_ACTION]
30
-
31
- class DecisionEngine:
32
- def __init__(self, **kwargs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  pass
34
- def select_optimal_action(self, actions, event, **kwargs):
35
- return type('obj', (object,), {
36
- 'best_action': HealingAction.NO_ACTION,
37
- 'expected_utility': 0.0,
38
- 'alternatives': [],
39
- 'explanation': 'Mock decision engine in sandbox',
40
- 'raw_data': {},
41
- })()
42
- def compute_risk(self, action, event, component):
43
- return 0.0
44
-
45
- class RAGGraphMemory:
46
- pass
47
-
48
- class ReliabilityEvent:
49
- component: str = "default"
50
- latency_p99: float = 0.0
51
- error_rate: float = 0.0
52
- cpu_util: Optional[float] = None
53
- memory_util: Optional[float] = None
54
- # ---------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
55
 
56
 
57
  def evaluate_intent(
58
  engine: RiskEngine,
59
- intent,
60
  cost_estimate: Optional[float],
61
  policy_violations: List[str]
62
  ) -> dict:
63
- """Mock sandbox evaluation – returns a fixed risk score."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  return {
65
- "risk_score": 0.38,
66
- "explanation": "Sandbox mock: high latency detected, escalating.",
67
- "contributions": {"conjugate_mean": 0.38}
68
  }
69
 
70
 
71
  def evaluate_healing_decision(
72
- event,
73
  policy_engine: PolicyEngine,
74
  decision_engine: Optional[DecisionEngine] = None,
75
  rag_graph: Optional[RAGGraphMemory] = None,
76
  model=None,
77
  tokenizer=None,
78
  ) -> Dict[str, Any]:
79
- """Mock sandbox healing evaluation – always returns NO_ACTION."""
80
- return {
81
- "risk_score": 0.0,
82
- "selected_action": HealingAction.NO_ACTION.value,
83
- "expected_utility": 0.0,
84
- "alternatives": [],
85
- "explanation": "Sandbox mock: no healing actions evaluated.",
86
- "epistemic_signals": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  "entropy": 0.0,
88
  "contradiction": 0.0,
89
  "evidence_lift": 0.0,
90
  "hallucination_risk": 0.0,
91
- },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  }
93
 
94
 
95
  def get_system_risk() -> float:
96
- import random
97
- return round(random.uniform(0, 1), 2)
 
 
 
 
 
 
 
1
+ """
2
+ Risk service – integrates ARF risk engine, policy engine, and decision engine.
3
+ Deterministic, no random fallbacks, explicit error handling.
4
+
5
+ Version: 2026-05-04 – added Prometheus metrics for observability.
6
+ """
7
+
8
+ import json
9
+ import logging
10
+ import os
11
+ import time
12
  from typing import Optional, List, Dict, Any
13
+
14
+ from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
15
+ from agentic_reliability_framework.core.governance.intents import InfrastructureIntent
16
+ from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction
17
+ from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
18
+ from agentic_reliability_framework.core.decision.decision_engine import DecisionEngine
19
+ from agentic_reliability_framework.runtime.memory.rag_graph import RAGGraphMemory
20
+ from agentic_reliability_framework.core.research.eclipse_probe import compute_epistemic_risk
21
+
22
+ # ── optional tracing ─────────────────────────────────────────
23
+ try:
24
+ from opentelemetry import trace
25
+ _tracer = trace.get_tracer(__name__)
26
+ OTEL_AVAILABLE = True
27
+ except ImportError:
28
+ OTEL_AVAILABLE = False
29
+ _tracer = None
30
+
31
+ # ── Prometheus metrics (always registered; no‑op if not scraped)
32
+ from prometheus_client import Counter, Histogram
33
+
34
+ _EVAL_COUNTER = Counter(
35
+ "arf_evaluations_total",
36
+ "Total evaluation calls (intent + healing), partitioned by engine and status.",
37
+ ["engine", "status"],
38
+ )
39
+
40
+ _EVAL_DURATION = Histogram(
41
+ "arf_evaluation_duration_seconds",
42
+ "End‑to‑end latency of evaluation calls.",
43
+ ["engine"],
44
+ buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0),
45
+ )
46
+
47
+ _RUST_AGREEMENT = Counter(
48
+ "arf_rust_agreement_total",
49
+ "Agreement between Rust enforcer and Python policy evaluation.",
50
+ ["result"], # "agreed" or "diverged"
51
+ )
52
+
53
+ # ── optional Rust enforcer (shadow mode) ──────────────────────
54
+ _RUST_ENFORCER_AVAILABLE = False
55
+ _rust_evaluator = None # singleton per process
56
+ _rust_policy_json: Optional[str] = None
57
+
58
+ if os.getenv("ARF_USE_RUST_ENFORCER", "false").lower() == "true":
59
+ try:
60
+ import arf_enforcer
61
+ _RUST_ENFORCER_AVAILABLE = True
62
+ except ImportError:
63
  pass
64
+
65
+ # Default OSS policy tree – mirrors the hard‑coded rules in the Python PolicyEvaluator
66
+ # that check region, resource type, and max permission level.
67
+ _OSS_POLICY_TREE_JSON = json.dumps({
68
+ "And": [
69
+ {"Atomic": {"RegionAllowed": {"allowed_regions": ["eastus"]}}},
70
+ {"Atomic": {"ResourceTypeRestricted": {
71
+ "forbidden_types": ["DATABASE_DROP", "FULL_ROLLOUT", "SYSTEM_SHUTDOWN", "SECRET_ROTATION"]
72
+ }}},
73
+ {"Atomic": {"MaxPermissionLevel": {"max_level": "admin"}}}
74
+ ]
75
+ })
76
+
77
+
78
+ def _ensure_rust_evaluator() -> bool:
79
+ """Lazy initialise the Rust policy evaluator. Returns True on success."""
80
+ global _rust_evaluator, _rust_policy_json
81
+ if _rust_evaluator is not None:
82
+ return True
83
+ if not _RUST_ENFORCER_AVAILABLE:
84
+ return False
85
+ try:
86
+ _rust_policy_json = _OSS_POLICY_TREE_JSON
87
+ _rust_evaluator = arf_enforcer.PyPolicyEvaluator(_rust_policy_json)
88
+ return True
89
+ except Exception:
90
+ _rust_evaluator = None
91
+ return False
92
+
93
+
94
+ logger = logging.getLogger(__name__)
95
 
96
 
97
  def evaluate_intent(
98
  engine: RiskEngine,
99
+ intent: InfrastructureIntent,
100
  cost_estimate: Optional[float],
101
  policy_violations: List[str]
102
  ) -> dict:
103
+ """
104
+ Evaluate an infrastructure intent using the Bayesian risk engine.
105
+
106
+ Optionally shadows the policy evaluation with the Rust enforcer when
107
+ the environment variable ARF_USE_RUST_ENFORCER is set to "true".
108
+ Any divergence is logged and counted as a Prometheus metric.
109
+
110
+ Parameters
111
+ ----------
112
+ engine : RiskEngine
113
+ Initialised ARF Bayesian risk engine.
114
+ intent : InfrastructureIntent
115
+ The infrastructure request to evaluate.
116
+ cost_estimate : float or None
117
+ Estimated monthly cost (used by cost‑threshold policies).
118
+ policy_violations : list[str]
119
+ Pre‑computed policy violation strings (from the Python evaluator).
120
+
121
+ Returns
122
+ -------
123
+ dict
124
+ Keys: risk_score, explanation, contributions.
125
+ """
126
+ t0 = time.monotonic()
127
+ span = None
128
+ if OTEL_AVAILABLE and _tracer:
129
+ span = _tracer.start_span("risk_service.evaluate_intent")
130
+ span.set_attribute("intent_type", type(intent).__name__)
131
+
132
+ # ── Shadow Rust enforcer (best‑effort, non‑blocking) ──────
133
+ if _RUST_ENFORCER_AVAILABLE and _ensure_rust_evaluator():
134
+ try:
135
+ rust_intent = {
136
+ "action": getattr(intent, "intent_type", "unknown"),
137
+ "component": getattr(intent, "service_name", "unknown"),
138
+ "region": getattr(intent, "region", None),
139
+ "resource_type": getattr(intent, "resource_type", None),
140
+ "permission_level": getattr(intent, "permission_level", None),
141
+ "extra": {}
142
+ }
143
+ rust_raw = _rust_evaluator.evaluate(
144
+ json.dumps(rust_intent), cost_estimate
145
+ )
146
+ rust_violations = json.loads(rust_raw)
147
+
148
+ agreed = set(rust_violations) == set(policy_violations)
149
+ _RUST_AGREEMENT.labels(result="agreed" if agreed else "diverged").inc()
150
+ if not agreed:
151
+ msg = (
152
+ "Rust enforcer divergence: "
153
+ f"Rust={sorted(rust_violations)} Python={sorted(policy_violations)}"
154
+ )
155
+ logger.warning(msg)
156
+ if span:
157
+ span.add_event("rust_enforcer_divergence", {
158
+ "rust_violations": rust_violations,
159
+ "python_violations": policy_violations
160
+ })
161
+ except Exception as exc:
162
+ logger.debug("Rust enforcer shadow evaluation failed: %s", exc)
163
+
164
+ # ── Core risk evaluation ──────────────────────────────────
165
+
166
+ # ── Automated canary promotion ──────────────────────────
167
+ if _RUST_ENFORCER_AVAILABLE and os.getenv("ARF_RUST_CANARY", "false").lower() == "true":
168
+ try:
169
+ from prometheus_client import REGISTRY
170
+ lower = REGISTRY.get_sample_value("arf_rust_agreement_lower_bound", {})
171
+ if lower is not None and lower > 0.9999:
172
+ policy_violations = rust_violations
173
+ if span:
174
+ span.set_attribute("rust_enforcer_active", True)
175
+ except Exception:
176
+ pass
177
+ try:
178
+ score, explanation, contributions = engine.calculate_risk(
179
+ intent=intent,
180
+ cost_estimate=cost_estimate,
181
+ policy_violations=policy_violations
182
+ )
183
+ engine_label = "python"
184
+ status = "success"
185
+ except Exception:
186
+ _EVAL_COUNTER.labels(engine="python", status="error").inc()
187
+ _EVAL_DURATION.labels(engine="python").observe(time.monotonic() - t0)
188
+ raise
189
+
190
+ _EVAL_COUNTER.labels(engine=engine_label, status=status).inc()
191
+ _EVAL_DURATION.labels(engine=engine_label).observe(time.monotonic() - t0)
192
+
193
+ if span:
194
+ span.set_attribute("risk_score", score)
195
+ if _RUST_ENFORCER_AVAILABLE:
196
+ span.set_attribute("rust_enforcer_available", True)
197
+ span.end()
198
+
199
  return {
200
+ "risk_score": score,
201
+ "explanation": explanation,
202
+ "contributions": contributions
203
  }
204
 
205
 
206
  def evaluate_healing_decision(
207
+ event: ReliabilityEvent,
208
  policy_engine: PolicyEngine,
209
  decision_engine: Optional[DecisionEngine] = None,
210
  rag_graph: Optional[RAGGraphMemory] = None,
211
  model=None,
212
  tokenizer=None,
213
  ) -> Dict[str, Any]:
214
+ """
215
+ Evaluate healing actions for a given reliability event using decision‑theoretic selection.
216
+ Includes epistemic risk signals from the eclipse probe.
217
+
218
+ Parameters
219
+ ----------
220
+ event : ReliabilityEvent
221
+ The incident event containing latency, error rate, etc.
222
+ policy_engine : PolicyEngine
223
+ The ARF healing policy engine with configured policies.
224
+ decision_engine : DecisionEngine, optional
225
+ If omitted, a default instance is created.
226
+ rag_graph : RAGGraphMemory, optional
227
+ Semantic memory for similar incident retrieval.
228
+ model, tokenizer : optional
229
+ HuggingFace model and tokenizer for epistemic risk computation.
230
+
231
+ Returns
232
+ -------
233
+ dict
234
+ Keys: risk_score, selected_action, expected_utility, alternatives,
235
+ explanation, epistemic_signals.
236
+ """
237
+ t0 = time.monotonic()
238
+ span = None
239
+ if OTEL_AVAILABLE and _tracer:
240
+ span = _tracer.start_span("risk_service.evaluate_healing")
241
+ span.set_attribute("component", event.component)
242
+
243
+ # If decision_engine not provided, try to get from policy_engine
244
+ if decision_engine is None and hasattr(policy_engine, 'decision_engine'):
245
+ decision_engine = policy_engine.decision_engine
246
+
247
+ # If still None, create a minimal one (global stats only)
248
+ if decision_engine is None:
249
+ logger.debug("No DecisionEngine provided; creating default instance")
250
+ decision_engine = DecisionEngine(rag_graph=rag_graph)
251
+
252
+ # Get raw candidate actions (by temporarily disabling decision engine)
253
+ orig_use = policy_engine.use_decision_engine
254
+ try:
255
+ policy_engine.use_decision_engine = False
256
+ raw_actions = policy_engine.evaluate_policies(event)
257
+ finally:
258
+ policy_engine.use_decision_engine = orig_use
259
+
260
+ # If no actions, return NO_ACTION
261
+ if not raw_actions or raw_actions == [HealingAction.NO_ACTION]:
262
+ if span:
263
+ span.set_attribute("selected_action", HealingAction.NO_ACTION.value)
264
+ span.end()
265
+ _EVAL_COUNTER.labels(engine="python", status="success").inc()
266
+ _EVAL_DURATION.labels(engine="python").observe(time.monotonic() - t0)
267
+ return {
268
+ "risk_score": 0.0,
269
+ "selected_action": HealingAction.NO_ACTION.value,
270
+ "expected_utility": 0.0,
271
+ "alternatives": [],
272
+ "explanation": "No candidate actions triggered.",
273
+ "epistemic_signals": None,
274
+ }
275
+
276
+ # Build reasoning text from policies that triggered the actions
277
+ reasoning_parts = []
278
+ for policy in policy_engine.policies:
279
+ if any(a in policy.actions for a in raw_actions):
280
+ conditions_str = ", ".join(
281
+ f"{c.metric} {c.operator} {c.threshold}" for c in policy.conditions
282
+ )
283
+ reasoning_parts.append(
284
+ f"Policy {policy.name} triggered by {conditions_str} → actions {[a.value for a in policy.actions]}"
285
+ )
286
+ reasoning_text = " ".join(reasoning_parts)
287
+
288
+ # Build evidence text from the event
289
+ evidence_text = (
290
+ f"Component: {event.component}, "
291
+ f"latency_p99: {event.latency_p99}, "
292
+ f"error_rate: {event.error_rate}, "
293
+ f"cpu_util: {event.cpu_util}, "
294
+ f"memory_util: {event.memory_util}"
295
+ )
296
+
297
+ # Compute epistemic signals (if model/tokenizer provided)
298
+ epistemic_signals = None
299
+ if model is not None and tokenizer is not None:
300
+ try:
301
+ epistemic_signals = compute_epistemic_risk(
302
+ reasoning_text, evidence_text, model, tokenizer
303
+ )
304
+ except Exception as e:
305
+ logger.error(f"Failed to compute epistemic risk: {e}")
306
+ epistemic_signals = {
307
+ "entropy": 0.0,
308
+ "contradiction": 0.0,
309
+ "evidence_lift": 0.0,
310
+ "hallucination_risk": 0.0,
311
+ }
312
+ else:
313
+ logger.debug("Epistemic model/tokenizer not provided; using zero signals")
314
+ epistemic_signals = {
315
  "entropy": 0.0,
316
  "contradiction": 0.0,
317
  "evidence_lift": 0.0,
318
  "hallucination_risk": 0.0,
319
+ }
320
+
321
+ # Run decision engine to get best action and alternatives
322
+ decision = decision_engine.select_optimal_action(
323
+ raw_actions, event, component=event.component,
324
+ epistemic_signals=epistemic_signals
325
+ )
326
+
327
+ # Extract risk of the selected action
328
+ risk_score = None
329
+ for alt in decision.alternatives:
330
+ if alt.action == decision.best_action:
331
+ risk_score = alt.risk
332
+ break
333
+ if risk_score is None:
334
+ # Compute risk separately
335
+ risk_score = decision_engine.compute_risk(
336
+ decision.best_action, event, event.component)
337
+
338
+ # Format alternatives (top 3 only)
339
+ alt_list = []
340
+ for alt in decision.alternatives[:3]:
341
+ alt_list.append({
342
+ "action": alt.action.value,
343
+ "expected_utility": alt.utility,
344
+ "risk": alt.risk,
345
+ })
346
+
347
+ # ── Metrics & span finalisation ───────────────────────────
348
+ _EVAL_COUNTER.labels(engine="python", status="success").inc()
349
+ _EVAL_DURATION.labels(engine="python").observe(time.monotonic() - t0)
350
+
351
+ if span:
352
+ span.set_attribute("risk_score", risk_score)
353
+ span.set_attribute("selected_action", decision.best_action.value)
354
+ span.set_attribute("expected_utility", decision.expected_utility)
355
+ span.end()
356
+
357
+ return {
358
+ "risk_score": risk_score,
359
+ "selected_action": decision.best_action.value,
360
+ "expected_utility": decision.expected_utility,
361
+ "alternatives": alt_list,
362
+ "explanation": decision.explanation,
363
+ "raw_decision": decision.raw_data,
364
+ "epistemic_signals": epistemic_signals,
365
  }
366
 
367
 
368
  def get_system_risk() -> float:
369
+ """
370
+ Return an aggregated risk score across all monitored components.
371
+ This is a placeholder – the endpoint is deprecated.
372
+ Raises NotImplementedError to avoid random fallback.
373
+ """
374
+ raise NotImplementedError(
375
+ "get_system_risk is deprecated. Use component‑level risk evaluation instead."
376
+ )
app/services/wilson_monitor.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Wilson confidence interval monitor for Rust enforcer agreement
2
+ from prometheus_client import Gauge
3
+ import math
4
+
5
+
6
+ LOWER_BOUND = Gauge(
7
+ "arf_rust_agreement_lower_bound",
8
+ "Lower 99.9% Wilson bound on agreement rate",
9
+ )
10
+
11
+
12
+ def wilson_lower(success, total, z=3.291):
13
+ """
14
+ Compute the lower bound of the Wilson confidence interval
15
+ for a binomial proportion.
16
+
17
+ Parameters
18
+ ----------
19
+ success : int
20
+ Number of agreed evaluations.
21
+ total : int
22
+ Total number of shadow evaluations (agreed + diverged).
23
+ z : float
24
+ Z‑score for the desired confidence level (default 3.291 for 99.9%).
25
+
26
+ Returns
27
+ -------
28
+ float
29
+ Lower bound of the Wilson interval, clamped to [0, 1].
30
+ """
31
+ if total == 0:
32
+ return 0.0
33
+ p = success / total
34
+ n = total
35
+ denom = 1 + z**2 / n
36
+ center = (p + z**2 / (2 * n)) / denom
37
+ margin = z * math.sqrt(p * (1 - p) / n + z**2 / (4 * n**2)) / denom
38
+ return max(0.0, center - margin)
39
+
40
+
41
+ def update(agreed, diverged):
42
+ """
43
+ Query the Prometheus agreement counters and set the lower‑bound gauge.
44
+
45
+ This function is called periodically by the background thread started
46
+ in the API lifespan (see `app/main.py`).
47
+
48
+ Parameters
49
+ ----------
50
+ agreed : int
51
+ Current value of `arf_rust_agreement_total{result="agreed"}`.
52
+ diverged : int
53
+ Current value of `arf_rust_agreement_total{result="diverged"}`.
54
+ """
55
+ lower = wilson_lower(agreed, agreed + diverged)
56
+ LOWER_BOUND.set(lower)
docker-compose.test.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ postgres:
5
+ image: postgres:15-alpine
6
+ environment:
7
+ POSTGRES_USER: testuser
8
+ POSTGRES_PASSWORD: testpass
9
+ POSTGRES_DB: testdb
10
+ ports:
11
+ - "5432:5432"
12
+ tmpfs: /var/lib/postgresql/data
docs/authentication.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Authentication
2
+
3
+ This page describes how to authenticate with the ARF API.
4
+
5
+ Current status
6
+
7
+ - There is no route-level or global authentication enforced by the API code in this repository. The API routes (including governance endpoints) do not validate API keys, tokens, or other credentials.
8
+
9
+ What the code provides
10
+
11
+ - The configuration model (app/core/config.py) exposes an optional `api_key` setting. This can be provided via environment variables or a `.env` file (the BaseSettings `env_file` is configured to read `.env`).
12
+
13
+ What this means for you
14
+
15
+ - Setting `API_KEY` in a `.env` file or environment variable will populate the `settings.api_key`, but the current route implementations do not check this value.
16
+ - If you require authentication, add a FastAPI dependency or middleware that checks `settings.api_key` (or another auth mechanism) and then apply it to routes or include it in a dependency override.
17
+
18
+ Suggested minimal approach to enable API key checking
19
+
20
+ - Implement a dependency in `app.api.deps` (e.g., `get_api_key`) that compares a header value to `settings.api_key` and raise `HTTPException(401)` when missing/invalid.
21
+ - Add that dependency to routers or individual endpoints where auth is required.
22
+
23
+ Notes
24
+
25
+ - Tests and example code in this repo currently run without auth.
docs/development.md ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Development
2
+
3
+ This page explains how to set up the ARF API for local development.
4
+
5
+ Requirements
6
+
7
+ - Python 3.10+ for local development (the deployment runtime pins Python 3.12.3 in `runtime.txt`)
8
+ - A virtual environment
9
+ - The project's Python dependencies (see `requirements.txt`). Note: `agentic-reliability-framework` is installed from a Git URL in `requirements.txt`.
10
+
11
+ Quick start
12
+
13
+ 1. Clone the repository:
14
+
15
+ git clone https://github.com/petter2025us/arf-api.git
16
+ cd arf-api
17
+
18
+ 2. Create and activate a virtualenv, then install dependencies:
19
+
20
+ python -m venv .venv
21
+ source .venv/bin/activate # or .\.venv\Scripts\activate on Windows
22
+ pip install -r requirements.txt
23
+
24
+ 3. Configure environment variables (optional):
25
+
26
+ - The project uses pydantic-settings with `env_file = ".env"` (see `app/core/config.py`). Create a `.env` file to set values locally.
27
+
28
+ Relevant environment variables used by the code:
29
+ - ARF_HMC_MODEL (default: `models/hmc_model.json`) — path to HMC model JSON used by RiskEngine.
30
+ - ARF_USE_HYPERPRIORS (default: `false`) — set to `true` to enable hyperprior behavior.
31
+ - API_KEY (optional) — will populate `settings.api_key` but note that routes currently do not enforce authentication.
32
+ - DATABASE_URL (optional) — configuration option in settings; tests use a local SQLite DB by default.
33
+
34
+ 4. Run the app with Uvicorn for development:
35
+
36
+ uvicorn app.main:app --reload --port 8000
37
+
38
+ - The application mounts routes under the `/api/v1` prefix and exposes a health endpoint at `/health`.
39
+
40
+ Running tests
41
+
42
+ - Tests use an on-disk SQLite test database (`sqlite:///./test.db`) created by the test fixtures (`tests/conftest.py`).
43
+ - To run tests:
44
+
45
+ pytest
46
+
47
+ - The test fixtures override the dependency that provides DB sessions so tests run against the test database.
48
+
49
+ Notes on the RiskEngine
50
+
51
+ - The app initializes a `RiskEngine` instance at startup (in `app.main`) using environment variables noted above. The engine instance is stored in `app.state.risk_engine` and is used by the governance endpoints.
52
+
53
+ Further development
54
+
55
+ - If you add persistent intent storage or authentication, update tests and dependency overrides accordingly.
docs/docs_endpoints.md ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # API Endpoints
2
+
3
+ This document describes the main ARF API endpoints and the request/response contracts used by the control plane.
4
+
5
+ ## POST `/api/v1/v1/incidents/evaluate`
6
+
7
+ Evaluates a reported incident and returns a heuristic healing recommendation, a counterfactual causal explanation, and a simplified utility decision.
8
+
9
+ This endpoint is **advisory only**. It does not apply remediation, mutate infrastructure, or execute any healing action.
10
+
11
+ ### Purpose
12
+
13
+ The endpoint takes a current incident snapshot, estimates risk, chooses a deterministic action, and explains the expected effect of that action on latency using a heuristic counterfactual model.
14
+
15
+ The implementation is intentionally simple:
16
+
17
+ - no fitted Structural Causal Model is used
18
+ - no machine learning model is required
19
+ - no historical training step is performed
20
+ - no action execution is triggered
21
+
22
+ ### Request schema
23
+
24
+ The request body must match the `ReliabilityEvent` model.
25
+
26
+ ```json
27
+ {
28
+ "component": "string",
29
+ "latency_p99": "number",
30
+ "error_rate": "number",
31
+ "service_mesh": "string",
32
+ "cpu_util": "number | null",
33
+ "memory_util": "number | null"
34
+ }
35
+ ```
36
+
37
+ #### Fields
38
+
39
+ `component`
40
+ : Name of the service or component being evaluated.
41
+
42
+ `latency_p99`
43
+ : The current 99th percentile latency value. The endpoint uses this value both for risk scoring and for the causal explanation.
44
+
45
+ `error_rate`
46
+ : The current error rate. The endpoint uses this value both for risk scoring and for the deterministic action threshold.
47
+
48
+ `service_mesh`
49
+ : Optional service mesh name. Defaults to `"default"`.
50
+
51
+ `cpu_util`
52
+ : Optional CPU utilization value. Present in the request model, but not used by the current decision logic.
53
+
54
+ `memory_util`
55
+ : Optional memory utilization value. Present in the request model, but not used by the current decision logic.
56
+
57
+ ### Response schema
58
+
59
+ The endpoint returns a JSON object with three top-level sections.
60
+
61
+ ```json
62
+ {
63
+ "healing_intent": {
64
+ "action": "string",
65
+ "component": "string",
66
+ "parameters": {},
67
+ "justification": "string",
68
+ "confidence": 0.85,
69
+ "risk_score": 0.0,
70
+ "status": "oss_advisory_only"
71
+ },
72
+ "causal_explanation": {
73
+ "factual_outcome": 0.0,
74
+ "counterfactual_outcome": 0.0,
75
+ "effect": 0.0,
76
+ "explanation_text": "string",
77
+ "is_model_based": false,
78
+ "warnings": ["string"]
79
+ },
80
+ "utility_decision": {
81
+ "best_action": "string",
82
+ "expected_utility": 0.5,
83
+ "explanation": "string"
84
+ }
85
+ }
86
+ ```
87
+
88
+ #### `healing_intent`
89
+
90
+ `action`
91
+ : The selected action. In the current implementation this is either `restart_container` or `no_action`.
92
+
93
+ `component`
94
+ : The input component name.
95
+
96
+ `parameters`
97
+ : Action parameters. The current implementation returns an empty object.
98
+
99
+ `justification`
100
+ : Human-readable explanation built from the causal explanation.
101
+
102
+ `confidence`
103
+ : Fixed confidence value returned by the endpoint. The current implementation uses `0.85`.
104
+
105
+ `risk_score`
106
+ : Heuristic risk score computed from latency and error rate.
107
+
108
+ `status`
109
+ : Always `oss_advisory_only`, indicating that the response is informational and not executable.
110
+
111
+ #### `causal_explanation`
112
+
113
+ `factual_outcome`
114
+ : The observed outcome value from the request context. The endpoint uses `latency_p99` as the explained metric.
115
+
116
+ `counterfactual_outcome`
117
+ : The estimated value under the proposed alternative action.
118
+
119
+ `effect`
120
+ : The difference between counterfactual and factual outcomes.
121
+
122
+ `explanation_text`
123
+ : Natural-language explanation of the counterfactual effect.
124
+
125
+ `is_model_based`
126
+ : Always `false` in the current implementation.
127
+
128
+ `warnings`
129
+ : A list of warning strings. The current implementation includes a warning that the causal model is heuristic and not SCM-based.
130
+
131
+ #### `utility_decision`
132
+
133
+ `best_action`
134
+ : The selected action, repeated for convenience.
135
+
136
+ `expected_utility`
137
+ : Fixed utility value returned by the current implementation. The endpoint uses `0.5`.
138
+
139
+ `explanation`
140
+ : Brief explanation that the choice came from heuristic latency and error thresholds.
141
+
142
+ ### Deterministic decision logic
143
+
144
+ The endpoint uses the following rule to choose the action:
145
+
146
+ ```text
147
+ optimal_action = RESTART_CONTAINER
148
+ if latency_p99 > 500 OR error_rate > 0.15
149
+ else NO_ACTION
150
+ ```
151
+
152
+ In the implementation, this is encoded as:
153
+
154
+ - `restart_container` when `latency_p99 > 500` or `error_rate > 0.15`
155
+ - `no_action` otherwise
156
+
157
+ No probabilistic policy or learned policy is involved.
158
+
159
+ ### Heuristic risk score
160
+
161
+ The risk score is computed as:
162
+
163
+ ```text
164
+ risk = min(1.0, (latency_p99 / 1000) * 0.7 + error_rate * 0.3)
165
+ ```
166
+
167
+ Properties of this score:
168
+
169
+ - normalized to the interval `[0, 1]`
170
+ - weighted more heavily toward latency than error rate
171
+ - clipped at `1.0`
172
+
173
+ ### Counterfactual model
174
+
175
+ The causal explainer uses a deterministic multiplicative heuristic:
176
+
177
+ ```text
178
+ counterfactual_outcome = factual_outcome * (1 + effect_frac)
179
+ ```
180
+
181
+ Where:
182
+
183
+ - `factual_outcome` is the observed metric value
184
+ - `effect_frac` is read from a fixed internal action-impact table
185
+ - the effect is multiplicative, not additive
186
+
187
+ For latency, the current action-impact mapping includes the following examples:
188
+
189
+ - `restart_container` → `latency_effect = -0.15`
190
+ - `scale_out` → `latency_effect = -0.20`
191
+ - `rollback` → `latency_effect = -0.25`
192
+ - `circuit_breaker` → `latency_effect = -0.05`
193
+ - `traffic_shift` → `latency_effect = -0.10`
194
+ - `alert_team` → `latency_effect = 0.0`
195
+ - `no_action` → `latency_effect = 0.0`
196
+
197
+ For error rate, the table includes a separate `error_rate_effect` per action, but the current endpoint calls the explainer with `outcome_metric="latency"`, so the returned counterfactual explanation is latency-based.
198
+
199
+ ### Uncertainty interval
200
+
201
+ The explainer applies a fixed uncertainty margin of ±10% around the estimated effect.
202
+
203
+ Let:
204
+
205
+ ```text
206
+ effect = counterfactual_outcome - factual_outcome
207
+ ci_half = abs(effect) * 0.1
208
+ confidence_interval = (counterfactual_outcome - ci_half, counterfactual_outcome + ci_half)
209
+ ```
210
+
211
+ This interval is heuristic only. It is not a calibrated statistical confidence interval.
212
+
213
+ ### How the endpoint uses the explainer
214
+
215
+ The endpoint constructs a local state object and passes it to the explainer:
216
+
217
+ - `current_state["latency"] = event.latency_p99`
218
+ - `current_state["error_rate"] = event.error_rate`
219
+ - `current_state["last_action"] = {"action_type": "no_action"}`
220
+
221
+ It then creates:
222
+
223
+ - `proposed_action = {"action_type": optimal_action.value, "params": {}}`
224
+
225
+ and calls:
226
+
227
+ ```text
228
+ CausalExplainer().explain_healing_intent(proposed_action, current_state, "latency")
229
+ ```
230
+
231
+ The resulting explanation is embedded into the `healing_intent` response.
232
+
233
+ ### Validation and error behavior
234
+
235
+ The endpoint uses Pydantic validation through the `ReliabilityEvent` model.
236
+
237
+ Expected behavior:
238
+
239
+ - valid requests return HTTP 200
240
+ - invalid request bodies are rejected by FastAPI/Pydantic before the handler logic runs
241
+
242
+ The current implementation does not define a custom error schema for validation failures.
243
+
244
+ ### Advisory-only behavior
245
+
246
+ The response includes:
247
+
248
+ ```json
249
+ "status": "oss_advisory_only"
250
+ ```
251
+
252
+ This means:
253
+
254
+ - the endpoint recommends an action
255
+ - it does not perform the action
256
+ - it does not mutate incident state
257
+ - it does not trigger remediation workflows by itself
258
+
259
+ ### Notes on implementation scope
260
+
261
+ The current endpoint is intentionally narrow:
262
+
263
+ - it bases the action choice on only two fields: `latency_p99` and `error_rate`
264
+ - it ignores `cpu_util`, `memory_util`, and `service_mesh` in the decision logic
265
+ - it always uses the latency metric in the causal explainer call
266
+ - it returns a fixed `expected_utility` value of `0.5`
267
+
268
+ ### Example request
269
+
270
+ ```bash
271
+ curl -X POST "http://localhost:8000/api/v1/v1/incidents/evaluate" -H "Content-Type: application/json" -d '{
272
+ "component": "payment-service",
273
+ "latency_p99": 450,
274
+ "error_rate": 0.25,
275
+ "service_mesh": "default",
276
+ "cpu_util": 0.85,
277
+ "memory_util": 0.90
278
+ }'
279
+ ```
280
+
281
+ ### Example response shape
282
+
283
+ ```json
284
+ {
285
+ "healing_intent": {
286
+ "action": "restart_container",
287
+ "component": "payment-service",
288
+ "parameters": {},
289
+ "justification": "Causal: If we apply restart_container instead of no_action, latency would change from 450.00 to 382.50 (Δ = -67.50). Based on heuristic causal model.",
290
+ "confidence": 0.85,
291
+ "risk_score": 0.4575,
292
+ "status": "oss_advisory_only"
293
+ },
294
+ "causal_explanation": {
295
+ "factual_outcome": 450,
296
+ "counterfactual_outcome": 382.5,
297
+ "effect": -67.5,
298
+ "explanation_text": "If we apply restart_container instead of no_action, latency would change from 450.00 to 382.50 (Δ = -67.50). Based on heuristic causal model.",
299
+ "is_model_based": false,
300
+ "warnings": [
301
+ "Using heuristic causal model (no fitted SCM)."
302
+ ]
303
+ },
304
+ "utility_decision": {
305
+ "best_action": "restart_container",
306
+ "expected_utility": 0.5,
307
+ "explanation": "Heuristic decision based on latency/error thresholds"
308
+ }
309
+ }
310
+ ```
311
+
312
+ ### Cross-reference
313
+
314
+ See `docs/examples.md` for a worked numerical example and `README.md` for a shorter overview.
docs/endpoints.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # API Endpoints
2
+
3
+ This page lists all available API endpoints.
4
+
5
+ General
6
+
7
+ - All API routers are mounted under the `/api/v1` prefix (see `app.main`).
8
+ - Health endpoint is available at `/health`.
9
+
10
+ Health
11
+
12
+ - GET /health
13
+ - Returns: `{ "status": "ok" }`
14
+ - Purpose: basic liveness/health check.
15
+
16
+ Governance (risk/intent evaluation)
17
+
18
+ - POST /api/v1/intents/evaluate
19
+ - Description: Evaluate an infrastructure intent and return a risk score and explanation.
20
+ - Body: an InfrastructureIntentRequest JSON object (see the model in `app.models.infrastructure_intents`).
21
+ - Behaviour: The endpoint converts the incoming intent to an OSS intent and calls into the locally initialized RiskEngine (`app.state.risk_engine`).
22
+ - Errors: May return 500 if evaluation fails.
23
+
24
+ - POST /api/v1/intents/outcome
25
+ - Description: Record the observed outcome of an executed intent to update priors.
26
+ - Behaviour: Not implemented in this repository; the endpoint raises `501 Not Implemented` because outcome recording has not yet been implemented.
27
+
28
+ Other routers
29
+
30
+ - The application also registers routers for incidents, risk, intents, and history at `/api/v1` (see `app.main`). Consult the respective modules in `app.api` for their exact endpoints and payloads.
31
+
32
+ Notes
33
+
34
+ - The governance evaluation relies on a `RiskEngine` instance initialized at app startup (see `app.main`) which reads `ARF_HMC_MODEL` and `ARF_USE_HYPERPRIORS` environment variables.
docs/examples.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Examples
2
+
3
+ This page provides usage examples for the ARF API.
4
+
5
+ Check health
6
+
7
+ curl example:
8
+
9
+ curl http://localhost:8000/health
10
+
11
+ Response:
12
+
13
+ {
14
+ "status": "ok"
15
+ }
16
+
17
+ Evaluate an intent (governance)
18
+
19
+ - Endpoint: POST /api/v1/intents/evaluate
20
+ - Content-Type: application/json
21
+
22
+ Example payload (minimal illustrative example — adapt to the `InfrastructureIntentRequest` model used by the project):
23
+
24
+ {
25
+ "id": "intent-123",
26
+ "description": "Example infrastructure change",
27
+ "estimated_cost": 100.0,
28
+ "policy_violations": []
29
+ }
30
+
31
+ Curl example:
32
+
33
+ curl -X POST http://localhost:8000/api/v1/intents/evaluate \
34
+ -H "Content-Type: application/json" \
35
+ -d '{"id":"intent-123","description":"Example","estimated_cost":100.0,"policy_violations":[]}'
36
+
37
+ Python (requests) example:
38
+
39
+ import requests
40
+
41
+ payload = {
42
+ "id": "intent-123",
43
+ "description": "Example infrastructure change",
44
+ "estimated_cost": 100.0,
45
+ "policy_violations": []
46
+ }
47
+
48
+ resp = requests.post("http://localhost:8000/api/v1/intents/evaluate", json=payload)
49
+ print(resp.status_code, resp.text)
50
+
51
+ Notes
52
+
53
+ - The evaluate endpoint uses an in-process `RiskEngine` (initialized in `app.main`) to compute risk and explanations.
54
+ - The `/api/v1/intents/outcome` endpoint exists but currently returns 501 Not Implemented — outcome recording/storage is incomplete in this repo.
docs/index.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ARF API Control Plane
2
+
3
+ Welcome to the ARF API documentation.
4
+
5
+ Overview
6
+
7
+ - This repository implements the ARF API Control Plane (FastAPI) — the application mounts a number of routers under `/api/v1` and exposes a health endpoint at `/health`.
8
+ - App version (from app.main): 0.2.0
9
+
10
+ Important notes
11
+
12
+ - A `RiskEngine` is initialized at app startup and stored at `app.state.risk_engine`. The engine reads `ARF_HMC_MODEL` and `ARF_USE_HYPERPRIORS` environment variables.
13
+ - Authentication: there is an optional `api_key` in configuration, but request handlers do not currently enforce authentication.
14
+ - The `/api/v1/intents/outcome` endpoint exists but returns 501 Not Implemented; intent outcome recording/storage is not yet implemented.
15
+
16
+ See the other documentation pages for development instructions, endpoints, and examples.
monitor.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Periodic watchdog: probe the currently recorded tunnel URL and restart the
# stack (via start.sh) when the health check fails. Intended to be run from
# cron or a loop; appends a one-line status to the log on every run.

URL_FILE="/workspaces/arf-api/current_url.txt"
LOG_FILE="/workspaces/arf-api/monitor.log"

# Without a recorded URL there is nothing to probe.
if [ ! -f "$URL_FILE" ]; then
    echo "$(date): No URL file found. Exiting." >> "$LOG_FILE"
    exit 1
fi

CURRENT_URL=$(cat "$URL_FILE")

# -s silences progress output; -f makes curl exit non-zero on HTTP errors,
# so any failure (network or HTTP) takes the restart branch.
if curl -s -f "$CURRENT_URL/health" > /dev/null; then
    echo "$(date): Tunnel OK." >> "$LOG_FILE"
else
    echo "$(date): Tunnel down. Restarting..." >> "$LOG_FILE"
    /workspaces/arf-api/start.sh
fi
render.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ - type: web
3
+ name: arf-api
4
+ runtime: python
5
+ buildCommand: pip install -r requirements.txt
6
+ startCommand: uvicorn app.main:app --host 0.0.0.0 --port $PORT
7
+ envVars:
8
+ - key: DATABASE_URL
9
+ fromDatabase:
10
+ name: arf-db
11
+ property: connectionString
12
+ - key: API_KEY
13
+ sync: false
14
+ - key: ENVIRONMENT
15
+ value: production
16
+ databases:
17
+ - name: arf-db
18
+ databaseName: arf
19
+ user: arf_user
requirements-dev.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ pytest-cov>=7.0.0
2
+ jsonschema>=4.0.0
3
+ pytest-asyncio>=0.24.0
requirements.txt CHANGED
@@ -1,8 +1,10 @@
1
  fastapi==0.115.12
2
  uvicorn[standard]==0.34.0
3
- pydantic==2.12.5
 
 
 
4
  pytest==8.3.5
5
- pytest-cov>=6.0.0
6
  httpx==0.28.1
7
  alembic
8
  pydantic-settings
@@ -11,9 +13,11 @@ psycopg2-binary==2.9.10
11
  slowapi==0.1.9
12
  prometheus-fastapi-instrumentator==7.1.0
13
  flake8==7.2.0
14
- cryptography
15
  sentence-transformers>=2.2.0
16
  scikit-learn
17
- redis>=4.0.0
18
  stripe>=9.0.0
19
- pandas
 
 
 
1
  fastapi==0.115.12
2
  uvicorn[standard]==0.34.0
3
+ pydantic>=2.13.2
4
+ agentic-reliability-framework @ git+https://github.com/arf-foundation/agentic-reliability-framework@main
5
+ arf-pricing-calculator @ git+https://github.com/arf-foundation/ARF-Bayesian-Pricing-Calculator@main
6
+ pytest==8.3.5
7
  pytest==8.3.5
 
8
  httpx==0.28.1
9
  alembic
10
  pydantic-settings
 
13
  slowapi==0.1.9
14
  prometheus-fastapi-instrumentator==7.1.0
15
  flake8==7.2.0
16
+ cryptography==47.0.0
17
  sentence-transformers>=2.2.0
18
  scikit-learn
19
+ redis>=4.0.0 # optional, for faster counters
20
  stripe>=9.0.0
21
+ opentelemetry-api>=1.20.0
22
+ opentelemetry-sdk>=1.20.0
23
+ opentelemetry-instrumentation-fastapi>=0.50b0
runtime.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ python-3.12.3
2
+ # force fresh build
seed_rag_data.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Seed RAG graph with historical healing action success rates.
3
+ Run once before starting the API server.
4
+ """
5
+ import sys
6
+ import os
7
+ sys.path.append(os.path.dirname(__file__))
8
+
9
+ from app.core.deps import get_rag_graph
10
+ from agentic_reliability_framework.core.models.event import HealingAction
11
+
12
+ def seed_historical_data():
13
+ rag = get_rag_graph()
14
+
15
+ # Define seed incidents (each with an outcome)
16
+ seed_data = [
17
+ # restart_container successes
18
+ {"incident_id": "seed_restart_1", "component": "test", "action": HealingAction.RESTART_CONTAINER.value, "success": True, "resolution_time_minutes": 2},
19
+ {"incident_id": "seed_restart_2", "component": "test", "action": HealingAction.RESTART_CONTAINER.value, "success": True, "resolution_time_minutes": 3},
20
+ {"incident_id": "seed_restart_3", "component": "test", "action": HealingAction.RESTART_CONTAINER.value, "success": False, "resolution_time_minutes": 10},
21
+
22
+ # rollback successes
23
+ {"incident_id": "seed_rollback_1", "component": "test", "action": HealingAction.ROLLBACK.value, "success": True, "resolution_time_minutes": 1},
24
+ {"incident_id": "seed_rollback_2", "component": "test", "action": HealingAction.ROLLBACK.value, "success": True, "resolution_time_minutes": 2},
25
+ {"incident_id": "seed_rollback_3", "component": "test", "action": HealingAction.ROLLBACK.value, "success": False, "resolution_time_minutes": 5},
26
+
27
+ # scale_out successes
28
+ {"incident_id": "seed_scale_1", "component": "test", "action": HealingAction.SCALE_OUT.value, "success": True, "resolution_time_minutes": 5},
29
+ {"incident_id": "seed_scale_2", "component": "test", "action": HealingAction.SCALE_OUT.value, "success": False, "resolution_time_minutes": 15},
30
+
31
+ # circuit_breaker successes
32
+ {"incident_id": "seed_cb_1", "component": "test", "action": HealingAction.CIRCUIT_BREAKER.value, "success": True, "resolution_time_minutes": 1},
33
+ {"incident_id": "seed_cb_2", "component": "test", "action": HealingAction.CIRCUIT_BREAKER.value, "success": True, "resolution_time_minutes": 2},
34
+
35
+ # traffic_shift successes
36
+ {"incident_id": "seed_ts_1", "component": "test", "action": HealingAction.TRAFFIC_SHIFT.value, "success": True, "resolution_time_minutes": 4},
37
+ {"incident_id": "seed_ts_2", "component": "test", "action": HealingAction.TRAFFIC_SHIFT.value, "success": False, "resolution_time_minutes": 8},
38
+ ]
39
+
40
+ # Add each outcome to the RAG graph
41
+ for item in seed_data:
42
+ # Create a dummy reliability event (simplified)
43
+ from agentic_reliability_framework.core.models.event import ReliabilityEvent
44
+ event = ReliabilityEvent(
45
+ component=item["component"],
46
+ latency_p99=500, # placeholder
47
+ error_rate=0.1,
48
+ service_mesh="default"
49
+ )
50
+ # Record the outcome
51
+ rag.record_outcome(
52
+ incident_id=item["incident_id"],
53
+ event=event,
54
+ action_taken=item["action"],
55
+ success=item["success"],
56
+ resolution_time_minutes=item["resolution_time_minutes"]
57
+ )
58
+ print(f"Seeded: {item['action']} -> success={item['success']}")
59
+
60
+ print(f"Seeded {len(seed_data)} historical outcomes.")
61
+ print(f"Stats per action:")
62
+ for action in HealingAction:
63
+ stats = rag.get_historical_effectiveness(action.value, component_filter="test")
64
+ print(f" {action.value}: uses={stats['total_uses']}, success_rate={stats['success_rate']:.2f}, avg_time={stats['avg_resolution_time_minutes']:.1f} min")
65
+
66
+ if __name__ == "__main__":
67
+ seed_historical_data()
start.sh ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash

# Restart the backend (uvicorn) plus a cloudflared quick tunnel, then point
# the Vercel frontend at the freshly assigned tunnel URL.

# Set paths
BACKEND_DIR="/workspaces/arf-api"
FRONTEND_DIR="/workspaces/arf-frontend"
VENV_ACTIVATE="$BACKEND_DIR/venv/bin/activate"
CLOUDFLARED=$(which cloudflared 2>/dev/null || echo "/usr/local/bin/cloudflared")

# Kill any existing processes
echo "🛑 Stopping existing uvicorn and cloudflared..."
pkill -f uvicorn
pkill -f cloudflared
sleep 2

# Start uvicorn
echo "🚀 Starting uvicorn..."
cd "$BACKEND_DIR"
source "$VENV_ACTIVATE"
uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload &

# Verify uvicorn is running. Retry instead of one fixed 3-second sleep so a
# slow cold start does not produce a false "failed to start".
UVICORN_UP=""
for i in {1..15}; do
    if curl -s http://localhost:8000/health >/dev/null; then
        UVICORN_UP="yes"
        break
    fi
    sleep 1
done
if [ -z "$UVICORN_UP" ]; then
    echo "❌ uvicorn failed to start. Exiting."
    exit 1
fi
echo "✅ uvicorn is running."

# Start cloudflared and capture URL
echo "🌐 Starting cloudflared tunnel..."
TEMP_FILE=$(mktemp)
$CLOUDFLARED tunnel --url http://localhost:8000 2>&1 | tee "$TEMP_FILE" &

# Wait for URL to appear
echo "⏳ Waiting for tunnel URL..."
URL=""
for i in {1..30}; do
    URL=$(grep -oP 'https://[a-z0-9-]+\.trycloudflare\.com' "$TEMP_FILE" | head -1)
    if [ -n "$URL" ]; then
        break
    fi
    sleep 1
done

if [ -z "$URL" ]; then
    echo "❌ Failed to get tunnel URL."
    exit 1
fi
echo "✅ Tunnel URL: $URL"

# Save URL for monitoring (used by monitor.sh)
echo "$URL" > /workspaces/arf-api/current_url.txt

# Update Vercel environment variable
echo "🔧 Updating Vercel environment variable..."
cd "$FRONTEND_DIR"
if command -v vercel &>/dev/null; then
    vercel env rm NEXT_PUBLIC_API_URL production -y
    echo "$URL" | vercel env add NEXT_PUBLIC_API_URL production
    echo "🔄 Redeploying frontend..."
    vercel --prod
else
    echo "⚠️ Vercel CLI not installed. Please install it with: npm i -g vercel"
    echo "Then manually update the env var to: $URL"
fi

echo "🎉 All done! Your new URL is: $URL"
echo "Frontend will be updated shortly. Check https://arf-frontend-sandy.vercel.app"
tests/conftest.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ pytest configuration and fixtures for ARF API tests.
3
+ """
4
+
5
+ from app.core.usage_tracker import enforce_quota, Tier
6
+ from app.api.deps import get_db
7
+ from app.database.base import Base
8
+ from app.main import app as fastapi_app
9
+ from sqlalchemy.orm import sessionmaker
10
+ from sqlalchemy import create_engine
11
+ from fastapi.testclient import TestClient
12
+ import app.core.usage_tracker
13
+ import os
14
+ import pytest
15
+
16
+ # ===== STEP 1: Set environment variables BEFORE any app imports =====
17
+ os.environ["ARF_USAGE_TRACKING"] = "false"
18
+
19
+ # Force the correct database URL for tests
20
+ os.environ["DATABASE_URL"] = "postgresql://postgres:postgres@localhost:5432/testdb"
21
+ os.environ["TEST_DATABASE_URL"] = "postgresql://postgres:postgres@localhost:5432/testdb"
22
+
23
+ # Additional PostgreSQL environment variables to prevent fallback to
24
+ # system user
25
+ os.environ["PGUSER"] = "postgres"
26
+ os.environ["PGPASSWORD"] = "postgres"
27
+ os.environ["PGHOST"] = "localhost"
28
+ os.environ["PGPORT"] = "5432"
29
+ os.environ["PGDATABASE"] = "testdb"
30
+
31
+
32
+ # ===== STEP 2: Mock the tracker module BEFORE importing app =====
33
+ class MockTracker:
34
+ def get_tier(self, api_key):
35
+ from app.core.usage_tracker import Tier
36
+
37
+ return Tier.PRO
38
+
39
+ def get_remaining_quota(self, api_key, tier):
40
+
41
+ return 1000
42
+
43
+ def consume_quota_and_log(self, record, idempotency_key=None):
44
+
45
+ return (True, None)
46
+
47
+ def increment_usage_sync(self, record, idempotency_key=None):
48
+ return True
49
+
50
+ def get_or_create_api_key(self, key, tier):
51
+
52
+ return True
53
+
54
+ def update_api_key_tier(self, key, tier):
55
+ return True
56
+
57
+ def _insert_audit_log(self, record):
58
+ pass
59
+
60
+
61
+ # Replace the tracker at the module level
62
+ app.core.usage_tracker.tracker = MockTracker()
63
+
64
+ # ===== STEP 3: Import app and database modules =====
65
+
66
+ # Force model registration (prevents "no such table" errors)
67
+
68
+ # Use the environment variable for the database URL (already set)
69
+ TEST_DATABASE_URL = os.getenv(
70
+ "TEST_DATABASE_URL",
71
+ "postgresql://postgres:postgres@localhost:5432/testdb")
72
+
73
+ if TEST_DATABASE_URL.startswith("postgresql"):
74
+ engine = create_engine(TEST_DATABASE_URL)
75
+ else:
76
+ engine = create_engine(
77
+ TEST_DATABASE_URL, connect_args={
78
+ "check_same_thread": False})
79
+
80
+ TestingSessionLocal = sessionmaker(
81
+ autocommit=False,
82
+ autoflush=False,
83
+ bind=engine)
84
+
85
+
86
+ def override_get_db():
87
+
88
+ db = TestingSessionLocal()
89
+ try:
90
+ yield db
91
+
92
+ finally:
93
+ db.close()
94
+
95
+
96
+ fastapi_app.dependency_overrides[get_db] = override_get_db
97
+
98
+ # Override enforce_quota dependency
99
+
100
+
101
+ async def mock_enforce_quota(request, api_key=None):
102
+ return {"api_key": "test_key", "tier": Tier.PRO, "remaining": 1000}
103
+ fastapi_app.dependency_overrides[enforce_quota] = mock_enforce_quota
104
+
105
+
106
+ @pytest.fixture(scope="session", autouse=True)
107
+ def setup_database():
108
+ """Create tables before any tests run."""
109
+ Base.metadata.create_all(bind=engine)
110
+ yield
111
+ Base.metadata.drop_all(bind=engine)
112
+
113
+
114
+ @pytest.fixture(scope="session")
115
+ def client():
116
+ with TestClient(fastapi_app) as test_client:
117
+ yield test_client
118
+
119
+
120
+ @pytest.fixture(scope="function")
121
+ def db_session():
122
+ """Provide a clean database session for each test."""
123
+ Base.metadata.create_all(bind=engine)
124
+ session = TestingSessionLocal()
125
+ yield session
126
+ session.rollback()
127
+ session.close()
128
+ Base.metadata.drop_all(bind=engine)
tests/test_deps.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pytest
from unittest.mock import patch, MagicMock
from app.api.deps import get_db


def test_get_db_closes_session():
    """The get_db generator must close its session even when the request raises."""
    fake_session = MagicMock()
    with patch('app.api.deps.SessionLocal', return_value=fake_session):
        gen = get_db()
        yielded = next(gen)
        assert yielded is fake_session
        # Simulate an exception during request handling
        with pytest.raises(Exception):
            gen.throw(Exception("test error"))
    fake_session.close.assert_called_once()
tests/test_governance.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for governance endpoints: /api/v1/intents/evaluate
3
+ """
4
+
5
+
6
+ def test_evaluate_provision_intent(client):
7
+ payload = {
8
+ "intent_type": "provision_resource",
9
+ "environment": "prod",
10
+ "resource_type": "database",
11
+ "region": "eastus",
12
+ "size": "Standard",
13
+ "estimated_cost": 1200,
14
+ "policy_violations": [],
15
+ "requester": "alice",
16
+ "provenance": {},
17
+ "configuration": {}
18
+ }
19
+ response = client.post("/api/v1/intents/evaluate", json=payload)
20
+ assert response.status_code == 200, response.text
21
+ data = response.json()
22
+ assert "risk_score" in data
23
+
24
+
25
+ def test_evaluate_grant_access(client):
26
+ payload = {
27
+ "intent_type": "grant_access",
28
+ "environment": "dev",
29
+ "principal": "bob",
30
+ "permission_level": "read",
31
+ "resource_scope": "/subscriptions/123",
32
+ "estimated_cost": None,
33
+ "policy_violations": [],
34
+ "requester": "alice",
35
+ "provenance": {},
36
+ "justification": "test"
37
+ }
38
+ response = client.post("/api/v1/intents/evaluate", json=payload)
39
+ assert response.status_code == 200, response.text
40
+ data = response.json()
41
+ assert "risk_score" in data
42
+
43
+
44
+ def test_evaluate_deploy_config(client):
45
+ payload = {
46
+ "intent_type": "deploy_config",
47
+ "environment": "staging",
48
+ "service_name": "payments-api",
49
+ "change_scope": "canary",
50
+ "deployment_target": "staging",
51
+ "estimated_cost": 20,
52
+ "policy_violations": [],
53
+ "requester": "alice",
54
+ "provenance": {},
55
+ "configuration": {}
56
+ }
57
+ response = client.post("/api/v1/intents/evaluate", json=payload)
58
+ assert response.status_code == 200, response.text
59
+ data = response.json()
60
+ assert "risk_score" in data
61
+
62
+
63
+ def test_invalid_intent_type(client):
64
+ payload = {
65
+ "intent_type": "UnknownIntent",
66
+ "environment": "prod",
67
+ "requester": "alice",
68
+ "provenance": {}
69
+ }
70
+ response = client.post("/api/v1/intents/evaluate", json=payload)
71
+ assert response.status_code == 422
tests/test_healing_endpoint.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi.testclient import TestClient
2
+ from app.main import app
3
+
4
+ client = TestClient(app)
5
+
6
+
7
+ def test_healing_evaluate_endpoint():
8
+ payload = {
9
+ "event": {
10
+ "component": "my-service",
11
+ "latency_p99": 450.0,
12
+ "error_rate": 0.25,
13
+ "service_mesh": "default",
14
+ "cpu_util": 0.85,
15
+ "memory_util": 0.90
16
+ }
17
+ }
18
+ response = client.post("/api/v1/healing/evaluate", json=payload)
19
+ assert response.status_code == 200, f"Expected 200, got {
20
+ response.status_code}: {
21
+ response.text}"