Meta

Paused

App Files Files Community

ayaan-ai committed 13 days ago

Commit

ffbce00

1 Parent(s): 9562a00

Add Suspect X OpenEnv environment (FastAPI + full reward pipeline)

Browse files

Files changed (9) hide show

Dockerfile +12 -0
app.py +60 -0
consistency_checker.py +43 -0
data.json +0 -0
grader.py +89 -0
openenv.yaml +13 -0
requirements.txt +4 -0
secret_factory.py +68 -0
suspect_x_environment.py +173 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,12 @@

+# Slim Python 3.11 base image.
+FROM python:3.11-slim
+WORKDIR /app
+# Copy and install the requirements before the rest of the source so this
+# layer can be reused from cache when only application code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+# Port the uvicorn server below listens on.
+EXPOSE 7860
+# Serve the FastAPI instance named `app` in app.py on all interfaces.
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from typing import Dict, Optional
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from suspect_x_environment import SuspectXEnvironment
+# Application instance; the routes below are registered on it.
+app = FastAPI(title="Suspect X — AI Interrogation Room", version="1.0.0")
+# CORS is fully open: any origin, method and header is accepted.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# One environment object is shared by every request; it tracks episodes by session_id.
+env = SuspectXEnvironment()
+# ------------------------------------------------------------------
+# Request models
+# ------------------------------------------------------------------
+# Body of POST /reset; every field has a default and is forwarded to env.reset().
+class ResetRequest(BaseModel):
+    # Number of hidden facts requested for the episode.
+    n_facts: int = 3
+    # Forwarded as the RNG seed; None leaves the choice unseeded.
+    seed: Optional[int] = None
+    # Forwarded as the difficulty filter; None applies no filter.
+    difficulty: Optional[str] = None
+# Body of POST /step; dumped to a plain dict and handed to env.step().
+class StepRequest(BaseModel):
+    # Identifier returned by POST /reset.
+    session_id: str
+    action_type: str        # "question" | "suspect_answer" | "submit_accusation"
+    # Text of the interrogator question or of the suspect answer.
+    content: Optional[str] = None
+    # Fact key -> guessed value; read only for "submit_accusation".
+    accusation_json: Optional[Dict[str, str]] = None
+# ------------------------------------------------------------------
+# Routes
+# ------------------------------------------------------------------
+# Liveness probe: always reports "ok" together with the environment name.
+@app.get("/")
+def health():
+    return {"status": "ok", "environment": "suspect_x_env"}
+@app.post("/reset")
+def reset(req: ResetRequest = ResetRequest()):
+    return env.reset(n_facts=req.n_facts, seed=req.seed, difficulty=req.difficulty)
+@app.post("/step")
+def step(req: StepRequest):
+    action = req.model_dump()
+    return env.step(action)
+# Expose the shared environment's `state` property as the response body.
+@app.get("/state")
+def state():
+    return env.state

consistency_checker.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import re
+from typing import Dict
+from secret_factory import Secret
+class ConsistencyChecker:
+    """
+    Tracks suspect assertions turn-by-turn.
+    Returns True (contradiction detected) if the suspect contradicts a prior claim.
+    Purely rule-based — no LLM calls.
+    """
+    LOCATION_PATTERNS = [
+        r"i was (?:at|in) ([\w'\s]+?)(?:\s*[,.]|$)",
+        r"i went to ([\w'\s]+?)(?:\s*[,.]|$)",
+        r"i stayed (?:at|in) ([\w'\s]+?)(?:\s*[,.]|$)",
+    ]
+    def __init__(self, secret: Secret):
+        self.secret = secret
+        self.assertions: Dict[str, str] = {}
+    def check(self, suspect_response: str) -> bool:
+        """Returns True if a contradiction is detected."""
+        text = suspect_response.lower()
+        for pattern in self.LOCATION_PATTERNS:
+            m = re.search(pattern, text)
+            if m:
+                claimed = m.group(1).strip()
+                if "location" in self.assertions:
+                    if not self._compatible(self.assertions["location"], claimed):
+                        return True
+                else:
+                    self.assertions["location"] = claimed
+        return False
+    @staticmethod
+    def _compatible(a: str, b: str) -> bool:
+        a_words = {w for w in a.split() if len(w) > 3}
+        b_words = {w for w in b.split() if len(w) > 3}
+        return bool(a_words & b_words) or a == b

data.json ADDED Viewed

The diff for this file is too large to render. See raw diff

grader.py ADDED Viewed

	@@ -0,0 +1,89 @@

+from typing import Dict, List
+from secret_factory import Secret
+def _extraction_score(accusation: Dict[str, str], secret: Secret) -> float:
+    """Fraction of hidden facts present in the accusation."""
+    if not secret.facts:
+        return 0.0
+    hits = 0
+    for key, true_val in secret.facts.items():
+        if key not in accusation:
+            continue
+        predicted = accusation[key].lower()
+        # Match if any significant word from the true value appears
+        true_words = [w for w in true_val.lower().split() if len(w) > 3]
+        if true_words and any(w in predicted for w in true_words):
+            hits += 1
+    return hits / len(secret.facts)
+def reward_extraction(accusation: Dict[str, str], secret: Secret) -> float:
+    """Interrogator-side reward: the extraction score of the accusation, unchanged."""
+    return _extraction_score(accusation, secret)
+def reward_no_false_facts(accusation: Dict[str, str], secret: Secret) -> float:
+    """Penalise every accusation key that isn't a real secret key."""
+    penalty = sum(1 for k in accusation if k not in secret.facts)
+    return max(0.0, 1.0 - 0.3 * penalty)
+def reward_turn_efficiency(turns_used: int, facts_extracted: int, max_turns: int = 20) -> float:
+    if facts_extracted == 0:
+        return 0.0
+    return (1.0 - turns_used / max_turns) * 0.2
+def reward_concealment(accusation: Dict[str, str], secret: Secret) -> float:
+    """Suspect-side reward: one minus the extraction score of the accusation."""
+    return 1.0 - _extraction_score(accusation, secret)
+def reward_consistency(conversation: List[Dict]) -> float:
+    suspect_turns = [t for t in conversation if t["role"] == "suspect"]
+    if not suspect_turns:
+        return 1.0
+    contradictions = sum(1 for t in suspect_turns if not t.get("consistency_ok", True))
+    return max(0.0, 1.0 - 0.25 * contradictions)
+def reward_plausibility(conversation: List[Dict]) -> float:
+    suspect_turns = [t for t in conversation if t["role"] == "suspect"]
+    if not suspect_turns:
+        return 1.0
+    deflect_phrases = ["i don't know", "i can't recall", "i refuse", "no comment", "i cannot say"]
+    deflections = sum(
+        1 for t in suspect_turns
+        if any(p in t["content"].lower() for p in deflect_phrases)
+    )
+    return max(0.0, 1.0 - 0.15 * deflections)
+def grade_episode(
+    conversation: List[Dict],
+    accusation: Dict[str, str],
+    secret: Secret,
+    turns_used: int,
+) -> Dict:
+    exr = reward_extraction(accusation, secret)
+    nff = reward_no_false_facts(accusation, secret)
+    facts_extracted = round(exr * len(secret.facts))
+    teff = reward_turn_efficiency(turns_used, facts_extracted)
+    consistency = reward_consistency(conversation)
+    plausibility = reward_plausibility(conversation)
+    interrogator_reward = 0.70 * exr + 0.20 * nff + 0.10 * teff
+    suspect_reward = (
+        0.50 * reward_concealment(accusation, secret)
+        + 0.35 * consistency
+        + 0.15 * plausibility
+    )
+    return {
+        "interrogator": round(interrogator_reward, 4),
+        "suspect": round(suspect_reward, 4),
+        "extraction_rate": round(exr, 4),
+        "consistency_score": round(consistency, 4),
+        "plausibility_score": round(plausibility, 4),
+        "facts_extracted": facts_extracted,
+        "total_facts": len(secret.facts),
+    }

openenv.yaml ADDED Viewed

	@@ -0,0 +1,13 @@

+spec_version: 1
+name: suspect-x-env
+display_name: "Suspect X — AI Interrogation Room"
+description: >
+  Two-agent adversarial RL environment. An Interrogator LLM tries to extract
+  hidden facts from a Suspect LLM. Reward is 100% deterministic — no LLM judge.
+  Supports multi-agent self-play and curriculum via n_facts parameter.
+type: space
+runtime: fastapi
+port: 7860
+themes:
+  - multi-agent
+  - self-improvement

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+fastapi>=0.100.0
+uvicorn>=0.23.0
+pydantic>=2.0.0
+openenv-core

secret_factory.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import json
+import random
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+# One generated episode: the crime's public details plus the facts to be hidden.
+@dataclass
+class Secret:
+    # The crime record's "id".
+    crime_id: str
+    # The crime record's "crime_description".
+    crime: str
+    # The crime record's "difficulty".
+    difficulty: str
+    # The "name" entry of the record's "suspect".
+    suspect_name: str
+    # The "fake_alibi" entry of the record's "suspect".
+    fake_alibi: str
+    facts: Dict[str, str]  # hidden key -> value pairs
+class SecretFactory:
+    _crimes: Optional[List[dict]] = None
+    @classmethod
+    def load(cls, path: str = "data.json") -> List[dict]:
+        if cls._crimes is None:
+            with open(path) as f:
+                raw = json.load(f)
+            cls._crimes = [
+                c for c in raw
+                if c.get("secrets") and c.get("suspect")
+            ]
+        return cls._crimes
+    @classmethod
+    def generate(
+        cls,
+        n_facts: Optional[int] = None,
+        seed: Optional[int] = None,
+        difficulty: Optional[str] = None,
+    ) -> Secret:
+        crimes = cls.load()
+        rng = random.Random(seed) if seed is not None else random
+        if difficulty:
+            pool = [c for c in crimes if c["difficulty"] == difficulty] or crimes
+            crime = rng.choice(pool)
+        else:
+            weights_map = {"easy": 0.5, "medium": 0.3, "hard": 0.2}
+            weights = [weights_map.get(c["difficulty"], 0.3) for c in crimes]
+            crime = rng.choices(crimes, weights=weights, k=1)[0]
+        return cls._build(crime, n_facts, rng)
+    @classmethod
+    def _build(cls, crime: dict, n_facts: Optional[int], rng) -> Secret:
+        valid = [s for s in crime["secrets"] if "key" in s and "value" in s]
+        shuffled = list(valid)
+        rng.shuffle(shuffled)
+        num = n_facts if n_facts is not None else crime.get("num_facts", len(shuffled))
+        num = min(num, len(shuffled))
+        selected = shuffled[:num]
+        return Secret(
+            crime_id=crime["id"],
+            crime=crime["crime_description"],
+            difficulty=crime["difficulty"],
+            suspect_name=crime["suspect"]["name"],
+            fake_alibi=crime["suspect"]["fake_alibi"],
+            facts={s["key"]: s["value"] for s in selected},
+        )

suspect_x_environment.py ADDED Viewed

	@@ -0,0 +1,173 @@

+from typing import Any, Dict, Optional
+from uuid import uuid4
+try:
+    from openenv.core.env_server.environment import Environment
+except ImportError:
+    # Fallback base class so the file is importable without openenv installed
+    class Environment:
+        pass
+from secret_factory import SecretFactory
+from grader import grade_episode
+from consistency_checker import ConsistencyChecker
+# Question budget per episode: the episode is graded automatically once a
+# suspect answer arrives and this many questions have been asked.
+MAX_TURNS = 20
+class SuspectXEnvironment(Environment):
+    """
+    Two-agent adversarial interrogation environment.
+    Session lifecycle:
+      POST /reset  → returns session_id + public crime info
+      POST /step   → action_type in {"question", "suspect_answer", "submit_accusation"}
+      GET  /state  → server-level stats (no secret exposed)
+    The secret is NEVER returned until the episode ends via submit_accusation.
+    """
+    SUPPORTS_CONCURRENT_SESSIONS = True
+    def __init__(self):
+        self._sessions: Dict[str, Dict] = {}
+    # ------------------------------------------------------------------
+    # OpenEnv interface
+    # ------------------------------------------------------------------
+    def reset(
+        self,
+        n_facts: int = 3,
+        seed: Optional[int] = None,
+        difficulty: Optional[str] = None,
+        **kwargs,
+    ) -> Dict:
+        session_id = str(uuid4())
+        secret = SecretFactory.generate(n_facts=n_facts, seed=seed, difficulty=difficulty)
+        self._sessions[session_id] = {
+            "secret": secret,
+            "conversation": [],
+            "turn_count": 0,
+            "checker": ConsistencyChecker(secret),
+            "done": False,
+        }
+        return {
+            "done": False,
+            "reward": 0.0,
+            "session_id": session_id,
+            "metadata": {
+                "crime_description": secret.crime,
+                "suspect_name": secret.suspect_name,
+                "fake_alibi": secret.fake_alibi,       # public cover story
+                "fact_keys": list(secret.facts.keys()),
+                "difficulty": secret.difficulty,
+                "turns_remaining": MAX_TURNS,
+                "conversation": [],
+            },
+        }
+    def step(self, action: Dict[str, Any], **kwargs) -> Dict:
+        session_id = action.get("session_id", "")
+        session = self._sessions.get(session_id)
+        if session is None:
+            return {"done": True, "reward": 0.0, "metadata": {"error": "invalid session_id"}}
+        if session["done"]:
+            return {"done": True, "reward": 0.0, "metadata": {"error": "episode already finished"}}
+        action_type = action.get("action_type", "")
+        if action_type == "question":
+            return self._handle_question(session_id, session, action)
+        elif action_type == "suspect_answer":
+            return self._handle_answer(session_id, session, action)
+        elif action_type == "submit_accusation":
+            return self._handle_accusation(session_id, session, action)
+        else:
+            return {
+                "done": False,
+                "reward": 0.0,
+                "session_id": session_id,
+                "metadata": {"error": f"unknown action_type: {action_type!r}"},
+            }
+    @property
+    def state(self) -> Dict:
+        return {
+            "environment": "suspect_x_env",
+            "active_sessions": len(self._sessions),
+        }
+    # ------------------------------------------------------------------
+    # Internal handlers
+    # ------------------------------------------------------------------
+    def _handle_question(self, sid: str, session: Dict, action: Dict) -> Dict:
+        session["turn_count"] += 1
+        session["conversation"].append({
+            "role": "interrogator",
+            "content": action.get("content", ""),
+        })
+        return {
+            "done": False,
+            "reward": 0.0,
+            "session_id": sid,
+            "metadata": {
+                "awaiting": "suspect_answer",
+                "turns_remaining": MAX_TURNS - session["turn_count"],
+            },
+        }
+    def _handle_answer(self, sid: str, session: Dict, action: Dict) -> Dict:
+        content = action.get("content", "")
+        contradiction = session["checker"].check(content)
+        session["conversation"].append({
+            "role": "suspect",
+            "content": content,
+            "consistency_ok": not contradiction,
+        })
+        if session["turn_count"] >= MAX_TURNS:
+            return self._grade_and_end(sid, session, accusation={})
+        return {
+            "done": False,
+            "reward": 0.0,
+            "session_id": sid,
+            "metadata": {
+                "awaiting": "interrogator_question_or_accusation",
+                "turns_remaining": MAX_TURNS - session["turn_count"],
+                "consistency_violation": contradiction,
+            },
+        }
+    def _handle_accusation(self, sid: str, session: Dict, action: Dict) -> Dict:
+        accusation = action.get("accusation_json", {})
+        if not isinstance(accusation, dict):
+            accusation = {}
+        return self._grade_and_end(sid, session, accusation)
+    def _grade_and_end(self, sid: str, session: Dict, accusation: Dict) -> Dict:
+        session["done"] = True
+        rewards = grade_episode(
+            session["conversation"],
+            accusation,
+            session["secret"],
+            session["turn_count"],
+        )
+        result = {
+            "done": True,
+            "reward": rewards["interrogator"],
+            "session_id": sid,
+            "metadata": {
+                **rewards,
+                "accusation": accusation,
+                "secret": session["secret"].facts,   # revealed at episode end
+                "conversation": session["conversation"],
+            },
+        }
+        del self._sessions[sid]
+        return result