Sibam committed on
Commit
cdf485e
·
0 Parent(s):

PreferenceLab OpenEnv environment for RLHF preference simulation

.dockerignore ADDED
@@ -0,0 +1,14 @@
+ __pycache__
+ *.pyc
+ *.pyo
+ .env
+ .git
+ .gitignore
+ .pytest_cache
+ tests/
+ scripts/
+ outputs/
+ *.egg-info
+ dist/
+ build/
+ README.md
.gitignore ADDED
@@ -0,0 +1,60 @@
+ # Python Bytecode
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.py[cod]
+ *$py.class
+
+ # Virtual Environments
+ .venv/
+ venv/
+ ENV/
+ env/
+
+ # Package Management
+ *.egg-info/
+ dist/
+ build/
+ *.egg
+ pip-log.txt
+ .pip/
+ uv.lock
+ uv.cache/
+
+ # IDE & Editor
+ .idea/
+ .vscode/settings.json
+ *.swp
+ *.swo
+ *~
+ .DS_Store
+ Thumbs.db
+
+ # Testing & Coverage
+ .pytest_cache/
+ .mypy_cache/
+ .coverage
+ htmlcov/
+ .ruff_cache/
+
+ # Logging
+ *.log
+
+ # Environment
+ .env
+ .env.local
+ .env.*.local
+
+ # Project-specific Generated Data
+ outputs/
+ data/pairwise_data.json
+ data/likert_data.json
+ data/consistency_data.json
+
+ # Machine Learning Models (if applicable)
+ *.bin
+ *.safetensors
+ *.pt
+ *.pth
+ *.pkl
+ *.joblib
Dockerfile ADDED
@@ -0,0 +1,54 @@
+ # Copyright (c) Team Nexis
+ # PreferenceLab OpenEnv Environment
+ # Based on the official openenv-base image pattern
+
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
+ FROM ${BASE_IMAGE} AS builder
+
+ WORKDIR /app
+
+ ARG BUILD_MODE=standalone
+
+ # Copy environment code
+ COPY . /app/env
+
+ WORKDIR /app/env
+
+ # Install git (build-time only)
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Install uv
+ RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
+     install -m 0755 /root/.local/bin/uv /usr/local/bin/uv && \
+     install -m 0755 /root/.local/bin/uvx /usr/local/bin/uvx
+
+ # Install dependencies (two-pass for caching: dependencies first, then the project)
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     uv sync --no-install-project --no-editable
+
+ RUN --mount=type=cache,target=/root/.cache/uv \
+     uv sync --no-editable
+
+ # ── Runtime stage ──────────────────────────────────────────────
+ FROM ${BASE_IMAGE}
+
+ WORKDIR /app
+
+ # Copy venv and code from builder
+ COPY --from=builder /app/env/.venv /app/.venv
+ COPY --from=builder /app/env /app/env
+
+ # Environment
+ ENV PATH="/app/.venv/bin:$PATH"
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
+ ENV ENABLE_WEB_INTERFACE=true
+ ENV MAX_CONCURRENT_ENVS=64
+
+ # Health check
+ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
+     CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
+
+ # Start server
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md ADDED
@@ -0,0 +1,254 @@
+ ---
+ title: PreferenceLab
+ emoji: 🧪
+ colorFrom: blue
+ colorTo: purple
+ sdk: docker
+ pinned: false
+ ---
+
+ # 🧪 PreferenceLab
+
+ **An OpenEnv environment simulating the RLHF human preference data collection pipeline.**
+
+ > Built for the Meta × Hugging Face OpenEnv Hackathon — Team Nexis
+
+ ---
+
+ ## Overview
+
+ PreferenceLab is a real-world OpenEnv environment in which an AI agent learns to judge
+ the quality of LLM responses — exactly as a human annotator would in an RLHF pipeline.
+
+ Instead of relying on expensive, slow human annotators, the environment provides:
+ - **Deterministic grading** using gold labels from real preference datasets
+ - **Partial reward signals** at every step (not just a binary end-of-episode reward)
+ - **3 task difficulty levels**: pairwise → multi-axis scoring → transitive ranking
+
+ This fills a genuine gap: no existing OpenEnv environment simulates the RLHF
+ data collection pipeline that powers models like Llama, Claude, and GPT-4.
+
+ ---
+
+ ## Action & Observation Spaces
+
+ ### Task 1 — Pairwise Ranking (Easy)
+
+ **Observation:**
+ ```python
+ PairwiseObservation(
+     prompt: str,        # The user instruction
+     response_a: str,    # Candidate response A
+     response_b: str,    # Candidate response B
+     reward: float,      # Last step reward
+     done: bool,
+     step_count: int,
+ )
+ ```
+
+ **Action:**
+ ```python
+ PairwiseAction(
+     choice: Literal["A", "B", "tie", "skip"],
+     justification: Optional[str],  # not graded
+ )
+ ```
+
+ **Grader:** +1.0 correct | +0.3 skip | +0.1 tie | +0.0 wrong
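The reward rule above is simple enough to state in code. Here is a minimal sketch (the `pairwise_reward` helper name is illustrative; the real grader lives in `server/environment.py`):

```python
def pairwise_reward(choice: str, gold_label: str) -> float:
    """Task 1 rule: +1.0 correct, +0.3 skip, +0.1 tie, +0.0 wrong."""
    if choice == "skip":
        return 0.3  # partial credit for abstaining
    if choice == "tie":
        return 0.1  # small partial credit for a tie
    return 1.0 if choice == gold_label else 0.0

print(pairwise_reward("A", "A"))    # 1.0
print(pairwise_reward("skip", "A"))  # 0.3
```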
+
+ ---
+
+ ### Task 2 — Multi-Axis Likert Scoring (Medium)
+
+ **Observation:**
+ ```python
+ LikertObservation(
+     prompt: str,
+     response: str,
+     rubric: str,        # Scoring instructions
+     reward: float,
+     done: bool,
+     step_count: int,
+ )
+ ```
+
+ **Action:**
+ ```python
+ LikertAction(
+     helpfulness: int,           # 1-5
+     honesty: int,               # 1-5
+     harmlessness: int,          # 1-5
+     instruction_following: int  # 1-5
+ )
+ ```
+
+ **Grader:** `reward = 1.0 - (MAE / 4.0)` — continuous signal based on deviation from gold scores
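The MAE formula above can be sketched directly; axis names follow the `LikertAction` model, and the example gold scores are illustrative (the real grader lives in `server/environment.py`):

```python
AXES = ["helpfulness", "honesty", "harmlessness", "instruction_following"]

def likert_reward(predicted: dict, gold: dict) -> float:
    """reward = 1.0 - (MAE / 4.0); 4 is the largest possible per-axis error (|1 - 5|)."""
    mae = sum(abs(predicted[a] - gold[a]) for a in AXES) / len(AXES)
    return 1.0 - mae / 4.0

gold = {"helpfulness": 4, "honesty": 5, "harmlessness": 5, "instruction_following": 4}
pred = {"helpfulness": 3, "honesty": 5, "harmlessness": 4, "instruction_following": 4}
print(likert_reward(pred, gold))  # MAE = 0.5, so reward = 0.875
```

An exact match yields 1.0, and being maximally wrong on every axis yields 0.0, so the signal stays continuous in between.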
+
+ ---
+
+ ### Task 3 — Transitive Consistency Ranking (Hard)
+
+ **Observation:**
+ ```python
+ ConsistencyObservation(
+     prompt: str,
+     response_a: str,
+     response_b: str,
+     response_c: str,
+     response_d: str,
+     reward: float,
+     done: bool,
+     step_count: int,
+ )
+ ```
+
+ **Action:**
+ ```python
+ ConsistencyAction(
+     ranking: list[str]  # e.g. ["C", "A", "D", "B"], best → worst
+ )
+ ```
+
+ **Grader:** Transitivity score (0.0–0.5) + Kendall's tau quality correlation (0.0–0.5)
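The Kendall's-tau half of this grader can be sketched as follows. This is an illustrative implementation, not the server's (the transitivity half depends on episode state held in `server/environment.py`), and it assumes the gold ranking orders responses best to worst:

```python
def kendall_tau_component(ranking: list[str], gold: list[str]) -> float:
    """Map Kendall's tau between two total orders from [-1, 1] into [0, 0.5]."""
    pos = {r: i for i, r in enumerate(ranking)}
    gpos = {r: i for i, r in enumerate(gold)}
    concordant = discordant = 0
    for i in range(len(gold)):
        for j in range(i + 1, len(gold)):
            a, b = gold[i], gold[j]
            # A pair (a, b) is concordant if both orders rank a and b the same way round
            if (pos[a] - pos[b]) * (gpos[a] - gpos[b]) > 0:
                concordant += 1
            else:
                discordant += 1
    tau = (concordant - discordant) / (concordant + discordant)
    return 0.5 * (tau + 1) / 2  # tau = 1 -> 0.5, tau = -1 -> 0.0

print(kendall_tau_component(["C", "A", "D", "B"], ["C", "A", "B", "D"]))  # 5/12 ≈ 0.4167
```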
+
+ ---
+
+ ## Reward Function
+
+ | Component | Range | Description |
+ |---|---|---|
+ | Correctness | 0.0–1.0 | Agreement with gold label |
+ | Partial credit | 0.1–0.3 | For abstain/tie actions |
+ | Trajectory | cumulative | Sum over episode steps |
+
+ Rewards are **non-sparse** — every step provides a signal. Graders are
+ **deterministic** and **reproducible** — same seed = same episode.
+
+ ---
+
+ ## Setup
+
+ ### Local Development
+
+ ```bash
+ git clone https://github.com/your-username/preference-lab
+ cd preference-lab
+ pip install -e .
+
+ # Optional: download real datasets
+ python scripts/prepare_datasets.py --samples 200
+
+ # Run tests
+ pytest tests/ -v
+
+ # Run inference baseline
+ python inference.py
+ ```
+
+ ### Environment Variables
+
+ | Variable | Required | Default | Description |
+ |---|---|---|---|
+ | `API_BASE_URL` | Yes | `https://api-inference.huggingface.co/v1` | LLM API endpoint |
+ | `MODEL_NAME` | Yes | `meta-llama/Llama-3.1-8B-Instruct` | Model to use for inference |
+ | `HF_TOKEN` | Yes | — | Hugging Face API key |
+ | `ENV_BASE_URL` | No | `http://localhost:8000` | Space URL for remote use |
+ | `MAX_CONCURRENT_ENVS` | No | `64` | Parallel sessions supported |
+
+ ### Docker
+
+ ```bash
+ docker build -t preference-lab .
+ docker run -p 8000:8000 \
+   -e HF_TOKEN=$HF_TOKEN \
+   -e API_BASE_URL=https://api-inference.huggingface.co/v1 \
+   -e MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct \
+   preference-lab
+ ```
+
+ ---
+
+ ## Using with TRL (GRPO Training)
+
+ ```python
+ from preference_lab import PreferenceLabEnv, PairwiseAction
+
+ class PreferenceEnvWrapper:
+     def __init__(self):
+         self.client = PreferenceLabEnv(base_url="https://your-space.hf.space")
+         self.reward = 0.0
+
+     def reset(self, **kwargs):
+         obs = self.client.reset()
+         return obs.prompt
+
+     def rank_responses(self, choice: str) -> str:
+         """
+         Choose which response is better.
+
+         Args:
+             choice: 'A' if response A is better, 'B' if B is better, 'tie' if equal.
+
+         Returns:
+             Feedback on the annotation quality.
+         """
+         result = self.client.step(PairwiseAction(choice=choice))
+         self.reward = result.reward
+         return f"Reward: {result.reward:.2f}"
+
+ def reward_func(environments, **kwargs):
+     return [env.reward for env in environments]
+ ```
+
+ ---
+
+ ## Datasets
+
+ | Task | Dataset | License |
+ |---|---|---|
+ | Pairwise | Anthropic HH-RLHF | MIT |
+ | Likert | UltraFeedback | MIT |
+ | Consistency | Stanford SHP | CC BY 4.0 |
+
+ Fallback synthetic data is included for zero-dependency testing.
+
+ ---
+
+ ## Project Structure
+
+ ```
+ preference-lab/
+ ├── __init__.py              # Package exports
+ ├── models.py                # Pydantic Action/Observation models
+ ├── client.py                # PreferenceLabEnv(MCPToolClient)
+ ├── inference.py             # Baseline inference script (START/STEP/END logs)
+ ├── openenv.yaml             # OpenEnv manifest
+ ├── pyproject.toml           # Dependencies
+ ├── Dockerfile               # Container definition
+ ├── requirements.txt         # Pip requirements
+ ├── README.md                # This file
+ ├── data/                    # Dataset JSONs (or synthetic fallback)
+ ├── scripts/
+ │   └── prepare_datasets.py  # Dataset download + conversion
+ ├── server/
+ │   ├── app.py               # FastAPI server
+ │   └── environment.py       # Core environment + graders
+ └── tests/
+     └── test_environment.py  # Full test suite
+ ```
+
+ ---
+
+ ## Team
+
+ **Team Nexis** — Sri Sri University × Zaalima Development Pvt. Ltd.
+
+ - Sibam Nanda (Architecture + Core Environment)
+ - Spandan Kar (Backend + Deployment)
+ - Shayanna Behera (Config + Documentation)
+
+ ---
+
+ ## License
+
+ BSD 3-Clause License
__init__.py ADDED
@@ -0,0 +1,27 @@
+ """
+ PreferenceLab: An OpenEnv environment simulating the RLHF
+ preference data collection pipeline.
+
+ Usage:
+     from preference_lab import PreferenceLabEnv, PairwiseAction, LikertAction, ConsistencyAction
+ """
+
+ from .client import PreferenceLabEnv
+ from .models import (
+     ConsistencyAction,
+     ConsistencyObservation,
+     LikertAction,
+     LikertObservation,
+     PairwiseAction,
+     PairwiseObservation,
+ )
+
+ __all__ = [
+     "PreferenceLabEnv",
+     "PairwiseAction",
+     "PairwiseObservation",
+     "LikertAction",
+     "LikertObservation",
+     "ConsistencyAction",
+     "ConsistencyObservation",
+ ]
client.py ADDED
@@ -0,0 +1,40 @@
+ """
+ PreferenceLab Environment Client.
+
+ Connects to a running PreferenceLab server via WebSocket/HTTP.
+
+ Example (async):
+     >>> from preference_lab import PreferenceLabEnv, PairwiseAction
+     >>> async with PreferenceLabEnv(base_url="https://your-space.hf.space") as env:
+     ...     obs = await env.reset()
+     ...     result = await env.step(PairwiseAction(choice="A"))
+
+ Example (sync):
+     >>> with PreferenceLabEnv(base_url="https://your-space.hf.space").sync() as env:
+     ...     obs = env.reset()
+     ...     result = env.step(PairwiseAction(choice="A"))
+ """
+
+ from openenv.core.mcp_client import MCPToolClient
+
+
+ class PreferenceLabEnv(MCPToolClient):
+     """
+     Client for the PreferenceLab Environment.
+
+     Provides tool-calling style interactions with the RLHF preference
+     simulation environment. Inherits all functionality from MCPToolClient.
+
+     Available tools (discovered via list_tools()):
+       - rank_responses: Task 1 — choose A or B
+       - score_response: Task 2 — rate on 4 axes
+       - order_responses: Task 3 — rank 4 responses
+
+     Example:
+         >>> with PreferenceLabEnv(base_url="http://localhost:8000").sync() as env:
+         ...     env.reset()
+         ...     tools = env.list_tools()
+         ...     result = env.call_tool("rank_responses", choice="A")
+     """
+
+     pass  # MCPToolClient provides all needed functionality
data/.gitkeep ADDED
File without changes
data/README.md ADDED
@@ -0,0 +1,59 @@
+ # Data Directory
+
+ This directory holds the preference datasets used by PreferenceLab.
+
+ On first run, if these files are absent, the environment falls back to
+ built-in synthetic examples (defined in `server/environment.py`).
+
+ ## File Format
+
+ ### pairwise_data.json
+ ```json
+ [
+   {
+     "prompt": "...",
+     "response_a": "...",
+     "response_b": "...",
+     "gold_label": "A",
+     "source": "hh-rlhf"
+   }
+ ]
+ ```
+
+ ### likert_data.json
+ ```json
+ [
+   {
+     "prompt": "...",
+     "response": "...",
+     "rubric": "...",
+     "gold_scores": {
+       "helpfulness": 4,
+       "honesty": 5,
+       "harmlessness": 5,
+       "instruction_following": 4
+     },
+     "source": "ultrafeedback"
+   }
+ ]
+ ```
+
+ ### consistency_data.json
+ ```json
+ [
+   {
+     "prompt": "...",
+     "response_a": "...",
+     "response_b": "...",
+     "response_c": "...",
+     "response_d": "...",
+     "gold_ranking": ["C", "A", "B", "D"],
+     "source": "stanford-shp"
+   }
+ ]
+ ```
+
+ ## Loading Real Datasets
+
+ Run `python scripts/prepare_datasets.py` to download and convert
+ HH-RLHF, UltraFeedback, and Stanford SHP into these formats.
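The load-or-fallback behaviour described above can be sketched as follows. The real loading logic lives in `server/environment.py`; the `load_pairwise` helper and `SYNTHETIC_PAIRWISE` record here are hypothetical illustrations of the pattern:

```python
import json
from pathlib import Path

# Built-in example used when data/pairwise_data.json is absent
SYNTHETIC_PAIRWISE = [{
    "prompt": "What is 2 + 2?",
    "response_a": "2 + 2 = 4.",
    "response_b": "2 + 2 = 22.",
    "gold_label": "A",
    "source": "synthetic",
}]

def load_pairwise(data_dir: str = "data") -> list[dict]:
    """Load the real dataset if present, else fall back to synthetic examples."""
    path = Path(data_dir) / "pairwise_data.json"
    if path.exists():
        return json.loads(path.read_text())
    return SYNTHETIC_PAIRWISE  # zero-dependency fallback
```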
inference.py ADDED
@@ -0,0 +1,269 @@
+ """
+ PreferenceLab Baseline Inference Script.
+
+ Runs a baseline LLM agent against all 3 tasks and reports scores.
+
+ Environment variables:
+     API_BASE_URL — LLM API endpoint (read from env; has a default)
+     MODEL_NAME — Model identifier (read from env; has a default)
+     HF_TOKEN — Hugging Face / API key (no default — injected by HF Spaces)
+     ENV_BASE_URL — PreferenceLab Space URL (optional, defaults to localhost)
+
+ Usage:
+     python inference.py
+     HF_TOKEN=hf_xxx MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct python inference.py
+ """
+
+ import json
+ import os
+ import time
+ from typing import Any
+
+ from openai import OpenAI
+
+ # ── Environment variables (MANDATORY pattern for hackathon) ───
+ API_BASE_URL = os.getenv("API_BASE_URL", "https://api-inference.huggingface.co/v1")
+ MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")
+ HF_TOKEN = os.getenv("HF_TOKEN")  # NO default — injected by HF Spaces
+ ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:8000")
+
+ # ── OpenAI client (MANDATORY: all LLM calls via OpenAI client) ─
+ client = OpenAI(
+     api_key=HF_TOKEN,
+     base_url=API_BASE_URL,
+ )
+
+ # ── Logging helpers ───────────────────────────────────────────
+
+ def log_start(task_name: str, task_id: str):
+     """Stdout START log — required structured format."""
+     print(f"START task_name={task_name} task_id={task_id} timestamp={time.time():.0f}")
+
+
+ def log_step(task_id: str, step: int, action: dict, reward: float):
+     """Stdout STEP log — required structured format."""
+     print(f"STEP task_id={task_id} step={step} action={json.dumps(action)} reward={reward:.4f}")
+
+
+ def log_end(task_id: str, total_reward: float, steps: int):
+     """Stdout END log — required structured format."""
+     print(f"END task_id={task_id} total_reward={total_reward:.4f} steps={steps}")
+
+
+ # ── LLM helpers ───────────────────────────────────────────────
+
+ def call_llm(system: str, user: str, max_tokens: int = 256) -> str:
+     """Call the LLM via the OpenAI client. Returns the text response."""
+     try:
+         response = client.chat.completions.create(
+             model=MODEL_NAME,
+             messages=[
+                 {"role": "system", "content": system},
+                 {"role": "user", "content": user},
+             ],
+             max_tokens=max_tokens,
+             temperature=0.0,  # deterministic for reproducibility
+         )
+         return response.choices[0].message.content.strip()
+     except Exception as e:
+         print(f"  [LLM ERROR] {e}")
+         return ""
+
+
+ def parse_json_response(text: str, fallback: dict) -> dict:
+     """Try to parse JSON from an LLM response."""
+     try:
+         # Find the JSON block
+         start = text.find("{")
+         end = text.rfind("}") + 1
+         if start >= 0 and end > start:
+             return json.loads(text[start:end])
+     except Exception:
+         pass
+     return fallback
+
+
+ # ── Task runners ──────────────────────────────────────────────
+
+ def run_task1_pairwise(env_client) -> dict[str, Any]:
+     """Run Task 1: pairwise ranking."""
+     from models import PairwiseAction
+     task_id = "task1_pairwise"
+     log_start("pairwise_ranking", task_id)
+
+     obs = env_client.reset(task_type="pairwise")
+     total_reward = 0.0
+     steps = 0
+
+     SYSTEM = (
+         "You are an expert AI response evaluator. "
+         "Given a prompt and two responses (A and B), choose which is better. "
+         "Reply ONLY with valid JSON: {\"choice\": \"A\"} or {\"choice\": \"B\"} or {\"choice\": \"tie\"}."
+     )
+
+     for step in range(5):
+         user_prompt = (
+             f"Prompt: {obs.prompt}\n\n"
+             f"Response A:\n{obs.response_a}\n\n"
+             f"Response B:\n{obs.response_b}\n\n"
+             "Which response is better? Reply with JSON only."
+         )
+         llm_out = call_llm(SYSTEM, user_prompt)
+         parsed = parse_json_response(llm_out, {"choice": "A"})
+         choice = parsed.get("choice", "A")
+         if choice not in ("A", "B", "tie", "skip"):
+             choice = "A"
+
+         action = PairwiseAction(choice=choice)
+         obs, reward, done, info = env_client.step(action)
+
+         total_reward += reward
+         steps += 1
+         log_step(task_id, step + 1, {"choice": choice}, reward)
+
+         if done:
+             break
+
+     log_end(task_id, total_reward, steps)
+     avg = round(total_reward / steps, 4) if steps > 0 else 0.0
+     return {"task": "pairwise", "total_reward": total_reward, "steps": steps, "avg_reward": avg}
+
+
+ def run_task2_likert(env_client) -> dict[str, Any]:
+     """Run Task 2: multi-axis Likert scoring."""
+     from models import LikertAction
+     task_id = "task2_likert"
+     log_start("likert_scoring", task_id)
+
+     obs = env_client.reset(task_type="likert")
+     total_reward = 0.0
+     steps = 0
+
+     SYSTEM = (
+         "You are an expert AI response evaluator. Score responses on 4 axes (1=worst, 5=best):\n"
+         "- helpfulness: how useful is the response?\n"
+         "- honesty: how accurate and truthful?\n"
+         "- harmlessness: how safe and appropriate?\n"
+         "- instruction_following: how well does it follow the instruction?\n"
+         "Reply ONLY with JSON: {\"helpfulness\": 4, \"honesty\": 5, \"harmlessness\": 5, \"instruction_following\": 4}"
+     )
+
+     for step in range(5):
+         user_prompt = (
+             f"Prompt: {obs.prompt}\n\n"
+             f"Response:\n{obs.response}\n\n"
+             f"Rubric: {obs.rubric}\n\n"
+             "Score this response. Reply with JSON only."
+         )
+         llm_out = call_llm(SYSTEM, user_prompt)
+         parsed = parse_json_response(llm_out, {
+             "helpfulness": 3, "honesty": 3, "harmlessness": 4, "instruction_following": 3
+         })
+
+         def clamp(v): return max(1, min(5, int(parsed.get(v, 3))))
+
+         action = LikertAction(
+             helpfulness=clamp("helpfulness"),
+             honesty=clamp("honesty"),
+             harmlessness=clamp("harmlessness"),
+             instruction_following=clamp("instruction_following"),
+         )
+         obs, reward, done, info = env_client.step(action)
+
+         total_reward += reward
+         steps += 1
+         log_step(task_id, step + 1, parsed, reward)
+
+         if done:
+             break
+
+     log_end(task_id, total_reward, steps)
+     avg = round(total_reward / steps, 4) if steps > 0 else 0.0
+     return {"task": "likert", "total_reward": total_reward, "steps": steps, "avg_reward": avg}
+
+
+ def run_task3_consistency(env_client) -> dict[str, Any]:
+     """Run Task 3: transitive consistency chain ranking."""
+     from models import ConsistencyAction
+     task_id = "task3_consistency"
+     log_start("consistency_ranking", task_id)
+
+     obs = env_client.reset(task_type="consistency")
+     total_reward = 0.0
+     steps = 0
+
+     SYSTEM = (
+         "You are an expert AI response evaluator. "
+         "Rank 4 responses (A, B, C, D) from best to worst. "
+         "Reply ONLY with JSON: {\"ranking\": [\"C\", \"A\", \"D\", \"B\"]} "
+         "(best first, worst last)."
+     )
+
+     for step in range(5):
+         user_prompt = (
+             f"Prompt: {obs.prompt}\n\n"
+             f"Response A:\n{obs.response_a}\n\n"
+             f"Response B:\n{obs.response_b}\n\n"
+             f"Response C:\n{obs.response_c}\n\n"
+             f"Response D:\n{obs.response_d}\n\n"
+             "Rank these responses from best to worst. Reply with JSON only."
+         )
+         llm_out = call_llm(SYSTEM, user_prompt)
+         parsed = parse_json_response(llm_out, {"ranking": ["A", "B", "C", "D"]})
+         ranking = parsed.get("ranking", ["A", "B", "C", "D"])
+         if not isinstance(ranking, list) or len(ranking) != 4:
+             ranking = ["A", "B", "C", "D"]
+
+         action = ConsistencyAction(ranking=ranking)
+         obs, reward, done, info = env_client.step(action)
+
+         total_reward += reward
+         steps += 1
+         log_step(task_id, step + 1, {"ranking": ranking}, reward)
+
+         if done:
+             break
+
+     log_end(task_id, total_reward, steps)
+     avg = round(total_reward / steps, 4) if steps > 0 else 0.0
+     return {"task": "consistency", "total_reward": total_reward, "steps": steps, "avg_reward": avg}
+
+
+ # ── Main ──────────────────────────────────────────────────────
+
+ def main():
+     print("=" * 60)
+     print("PreferenceLab Baseline Inference")
+     print(f"Model: {MODEL_NAME}")
+     print(f"API URL: {API_BASE_URL}")
+     print(f"Env URL: {ENV_BASE_URL}")
+     print("=" * 60)
+
+     # Import the environment directly for a local run;
+     # in production, connect to the HF Space via the client
+     import sys
+     sys.path.insert(0, ".")
+     from server.environment import PreferenceLabEnvironment
+
+     env = PreferenceLabEnvironment()
+
+     results = []
+     results.append(run_task1_pairwise(env))
+     results.append(run_task2_likert(env))
+     results.append(run_task3_consistency(env))
+
+     print("\n" + "=" * 60)
+     print("RESULTS SUMMARY")
+     print("=" * 60)
+     overall = 0.0
+     for r in results:
+         print(f"  {r['task']:20s} avg_reward={r['avg_reward']:.4f}  steps={r['steps']}")
+         overall += r["avg_reward"]
+     print(f"\n  Overall avg reward: {overall / len(results):.4f}")
+     print("=" * 60)
+
+     return results
+
+
+ if __name__ == "__main__":
+     main()
models.py ADDED
@@ -0,0 +1,111 @@
+ """
+ Data models for the PreferenceLab Environment.
+
+ PreferenceLab simulates the RLHF preference data collection pipeline.
+ An AI agent learns to judge which LLM responses are better, using
+ gold-standard labels from real preference datasets as ground truth.
+
+ Tasks:
+     Task 1 (Easy): Pairwise ranking — pick A or B
+     Task 2 (Medium): Multi-axis Likert scoring
+     Task 3 (Hard): Transitive consistency chain ranking
+ """
+
+ from typing import Literal, Optional
+ from pydantic import Field
+
+ from openenv.core.env_server.types import Action, Observation
+
+
+ # ── Actions ───────────────────────────────────────────────────
+
+ class PairwiseAction(Action):
+     """Action for Task 1: choose which response is better."""
+
+     choice: Literal["A", "B", "tie", "skip"] = Field(
+         ...,
+         description="Choose 'A' if response A is better, 'B' if response B is better, "
+         "'tie' if they are equal, or 'skip' if undecidable.",
+     )
+     justification: Optional[str] = Field(
+         default=None,
+         description="Optional reasoning for the choice (not used for grading).",
+     )
+
+
+ class LikertAction(Action):
+     """Action for Task 2: score a response on 4 quality axes (1-5 each)."""
+
+     helpfulness: int = Field(
+         ..., ge=1, le=5,
+         description="How helpful is the response? 1=not helpful, 5=extremely helpful.",
+     )
+     honesty: int = Field(
+         ..., ge=1, le=5,
+         description="How honest and accurate is the response? 1=misleading, 5=fully honest.",
+     )
+     harmlessness: int = Field(
+         ..., ge=1, le=5,
+         description="How harmless is the response? 1=harmful, 5=completely safe.",
+     )
+     instruction_following: int = Field(
+         ..., ge=1, le=5,
+         description="How well does the response follow the instruction? 1=ignores it, 5=follows perfectly.",
+     )
+
+
+ class ConsistencyAction(Action):
+     """Action for Task 3: rank 4 responses (A, B, C, D) from best to worst."""
+
+     ranking: list[str] = Field(
+         ...,
+         min_length=4,
+         max_length=4,
+         description="List of 4 response IDs ordered best to worst, e.g. ['B', 'A', 'D', 'C'].",
+     )
+
+
+ # ── Observations ──────────────────────────────────────────────
+
+ class PairwiseObservation(Observation):
+     """Observation for Task 1: a prompt with two candidate responses."""
+
+     task_id: str = Field(..., description="Unique task identifier.")
+     task_type: Literal["pairwise"] = Field(default="pairwise")
+     prompt: str = Field(..., description="The user prompt / instruction.")
+     response_a: str = Field(..., description="Candidate response A.")
+     response_b: str = Field(..., description="Candidate response B.")
+     reward: float = Field(default=0.0, description="Reward signal from the last step.")
+     done: bool = Field(default=False, description="Whether the episode is complete.")
+     step_count: int = Field(default=0, description="Current step within the episode.")
+     info: dict = Field(default_factory=dict, description="Extra debug info.")
+
+
+ class LikertObservation(Observation):
+     """Observation for Task 2: a prompt + single response to score on multiple axes."""
+
+     task_id: str = Field(..., description="Unique task identifier.")
+     task_type: Literal["likert"] = Field(default="likert")
+     prompt: str = Field(..., description="The user prompt / instruction.")
+     response: str = Field(..., description="The response to evaluate.")
+     rubric: str = Field(..., description="Scoring rubric to guide evaluation.")
+     reward: float = Field(default=0.0, description="Reward signal from the last step.")
+     done: bool = Field(default=False, description="Whether the episode is complete.")
+     step_count: int = Field(default=0, description="Current step within the episode.")
+     info: dict = Field(default_factory=dict, description="Extra debug info.")
+
+
+ class ConsistencyObservation(Observation):
+     """Observation for Task 3: a prompt + 4 responses to rank transitively."""
+
+     task_id: str = Field(..., description="Unique task identifier.")
+     task_type: Literal["consistency"] = Field(default="consistency")
+     prompt: str = Field(..., description="The user prompt / instruction.")
+     response_a: str = Field(..., description="Candidate response A.")
+     response_b: str = Field(..., description="Candidate response B.")
+     response_c: str = Field(..., description="Candidate response C.")
+     response_d: str = Field(..., description="Candidate response D.")
+     reward: float = Field(default=0.0, description="Reward signal from the last step.")
+     done: bool = Field(default=False, description="Whether the episode is complete.")
+     step_count: int = Field(default=0, description="Current step within the episode.")
+     info: dict = Field(default_factory=dict, description="Extra debug info.")
openenv.yaml ADDED
@@ -0,0 +1,6 @@
+ spec_version: 1
+ name: preference_lab
+ type: space
+ runtime: fastapi
+ app: server.app:app
+ port: 8000
pyproject.toml ADDED
@@ -0,0 +1,34 @@
+ [build-system]
+ requires = ["setuptools>=61.0"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "preference-lab"
+ version = "0.1.0"
+ description = "An OpenEnv environment simulating the RLHF preference data collection pipeline"
+ readme = "README.md"
+ license = { text = "BSD-3-Clause" }
+ requires-python = ">=3.10"
+ dependencies = [
+     "openenv-core>=0.2.1",
+     "fastapi>=0.104.0",
+     "uvicorn>=0.24.0",
+     "pydantic>=2.0.0",
+     "openai>=1.0.0",
+     "datasets>=2.14.0",
+     "httpx>=0.25.0",
+     "websockets>=11.0",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "pytest>=7.0",
+     "pytest-asyncio>=0.21",
+     "httpx>=0.25.0",
+ ]
+
+ [tool.setuptools.packages.find]
+ where = ["."]
+
+ [tool.pytest.ini_options]
+ asyncio_mode = "auto"
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ openenv-core>=0.2.1
+ fastapi>=0.104.0
+ uvicorn>=0.24.0
+ pydantic>=2.0.0
+ openai>=1.0.0
+ datasets>=2.14.0
+ httpx>=0.25.0
+ websockets>=11.0
scripts/__init__.py ADDED
File without changes
scripts/prepare_datasets.py ADDED
@@ -0,0 +1,201 @@
+ """
+ Dataset Preparation Script.
+
+ Downloads HH-RLHF, UltraFeedback, and Stanford SHP from Hugging Face
+ and converts them into the format expected by PreferenceLab.
+
+ Usage:
+     python scripts/prepare_datasets.py
+     python scripts/prepare_datasets.py --samples 200
+ """
+
+ import argparse
+ import json
+ import random
+ from pathlib import Path
+
+ DATA_DIR = Path(__file__).parent.parent / "data"
+ DATA_DIR.mkdir(exist_ok=True)
+
+
+ def prepare_pairwise(n_samples: int = 100):
+     """Download Anthropic HH-RLHF and convert to pairwise format."""
+     print(f"[1/3] Preparing pairwise data (HH-RLHF, {n_samples} samples)...")
+     try:
+         from datasets import load_dataset
+         ds = load_dataset("Anthropic/hh-rlhf", split="train", streaming=True)
+         records = []
+         for i, ex in enumerate(ds):
+             if i >= n_samples:
+                 break
+             # chosen = better response, rejected = worse
+             chosen = ex.get("chosen", "")
+             rejected = ex.get("rejected", "")
+             # Extract the last human turn as prompt
+             lines = chosen.split("\n\nAssistant:")
+             if len(lines) >= 2:
+                 prompt_block = lines[0].replace("Human:", "").strip()
+                 resp_a = lines[-1].strip()
+             else:
+                 prompt_block = chosen[:100]
+                 resp_a = chosen
+
+             rej_lines = rejected.split("\n\nAssistant:")
+             resp_b = rej_lines[-1].strip() if len(rej_lines) >= 2 else rejected
+
+             # Randomly swap A/B to avoid position bias, track gold
+             if random.random() < 0.5:
+                 records.append({
+                     "prompt": prompt_block,
+                     "response_a": resp_a,
+                     "response_b": resp_b,
+                     "gold_label": "A",
+                     "source": "hh-rlhf",
+                 })
+             else:
+                 records.append({
+                     "prompt": prompt_block,
+                     "response_a": resp_b,
+                     "response_b": resp_a,
+                     "gold_label": "B",
+                     "source": "hh-rlhf",
+                 })
+
+         out = DATA_DIR / "pairwise_data.json"
+         with open(out, "w") as f:
+             json.dump(records, f, indent=2)
+         print(f"  ✓ Saved {len(records)} pairwise examples → {out}")
+     except Exception as e:
+         print(f"  ✗ Failed: {e} — synthetic fallback will be used")
+
+
+ def prepare_likert(n_samples: int = 100):
+     """Download UltraFeedback and convert to likert format."""
+     print(f"[2/3] Preparing likert data (UltraFeedback, {n_samples} samples)...")
+     try:
+         from datasets import load_dataset
+         ds = load_dataset("openbmb/UltraFeedback", split="train", streaming=True)
+         records = []
+         for i, ex in enumerate(ds):
+             if i >= n_samples:
+                 break
+             instr = ex.get("instruction", "")
+             completions = ex.get("completions", [])
+             if not completions:
+                 continue
+             comp = completions[0]
+             response = comp.get("response", "")
+             annots = comp.get("annotations", {})
+
+             def extract_score(key, default=3):
+                 val = annots.get(key, {})
+                 if isinstance(val, dict):
+                     raw = val.get("Rating", default)
+                 elif isinstance(val, (int, float)):
+                     raw = val
+                 else:
+                     raw = default
+                 # UltraFeedback uses a 1-5 scale
+                 try:
+                     return max(1, min(5, int(raw)))
+                 except Exception:
+                     return default
+
+             records.append({
+                 "prompt": instr,
+                 "response": response,
+                 "rubric": (
+                     "Score on 4 axes (1=worst, 5=best): helpfulness, honesty, "
+                     "harmlessness, instruction_following."
+                 ),
+                 "gold_scores": {
+                     "helpfulness": extract_score("helpfulness"),
+                     "honesty": extract_score("honesty"),
+                     # UltraFeedback has no harmlessness axis; use truthfulness as a proxy
+                     "harmlessness": extract_score("truthfulness", 4),
+                     "instruction_following": extract_score("instruction_following"),
+                 },
+                 "source": "ultrafeedback",
+             })
+
+         out = DATA_DIR / "likert_data.json"
+         with open(out, "w") as f:
+             json.dump(records, f, indent=2)
+         print(f"  ✓ Saved {len(records)} likert examples → {out}")
+     except Exception as e:
+         print(f"  ✗ Failed: {e} — synthetic fallback will be used")
+
+
+ def prepare_consistency(n_samples: int = 60):
+     """Build 4-way ranking examples from Stanford SHP."""
+     print(f"[3/3] Preparing consistency data (Stanford SHP, {n_samples} samples)...")
+     try:
+         from datasets import load_dataset
+         ds = load_dataset("stanfordnlp/SHP", split="train", streaming=True)
+
+         # Group by post_id to collect multiple responses per prompt
+         grouped: dict[str, dict] = {}
+         for ex in ds:
+             pid = ex.get("post_id", "")
+             if pid not in grouped:
+                 grouped[pid] = {
+                     "prompt": ex.get("history", ""),
+                     "responses": [],
+                 }
+             grouped[pid]["responses"].append({
+                 "text": ex.get("human_ref_A", "") or ex.get("human_ref_B", ""),
+                 "score": ex.get("score_ratio", 1.0),
+             })
+             if len(grouped) >= n_samples * 3:
+                 break
+
+         records = []
+         for pid, data in grouped.items():
+             resps = data["responses"]
+             if len(resps) < 4:
+                 continue
+             # Sort by score descending = gold ranking
+             resps_sorted = sorted(resps[:4], key=lambda x: x["score"], reverse=True)
+             labels = ["A", "B", "C", "D"]
+             # Shuffle display order (not gold order)
+             shuffled = resps_sorted[:]
+             random.shuffle(shuffled)
+             id_map = {labels[i]: shuffled[i] for i in range(4)}
+             gold_ranking = sorted(labels, key=lambda l: resps_sorted.index(id_map[l]))
+
+             records.append({
+                 "prompt": data["prompt"][:500],
+                 "response_a": id_map["A"]["text"][:400],
+                 "response_b": id_map["B"]["text"][:400],
+                 "response_c": id_map["C"]["text"][:400],
+                 "response_d": id_map["D"]["text"][:400],
+                 "gold_ranking": gold_ranking,
+                 "source": "stanford-shp",
+             })
+             if len(records) >= n_samples:
+                 break
+
+         out = DATA_DIR / "consistency_data.json"
+         with open(out, "w") as f:
+             json.dump(records, f, indent=2)
+         print(f"  ✓ Saved {len(records)} consistency examples → {out}")
+     except Exception as e:
+         print(f"  ✗ Failed: {e} — synthetic fallback will be used")
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--samples", type=int, default=100,
+                         help="Number of samples per task (default: 100)")
+     args = parser.parse_args()
+
+     print("=" * 50)
+     print("PreferenceLab Dataset Preparation")
+     print("=" * 50)
+     prepare_pairwise(args.samples)
+     prepare_likert(args.samples)
+     prepare_consistency(args.samples // 2)
+     print("\n✓ Done. Run inference.py to test.")
+
+
+ if __name__ == "__main__":
+     main()
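One detail worth calling out in `prepare_pairwise` is the random A/B swap: without it, the dataset's chosen response would always sit in slot A, and a judge could score perfectly by always answering "A". Here is a minimal standalone sketch of the same debiasing logic; the `debias_pair` helper name is illustrative and not part of the repo:

```python
import random

def debias_pair(chosen: str, rejected: str, rng: random.Random) -> dict:
    """Randomly assign the chosen response to slot A or B, recording
    which slot holds it as the gold label (mirrors prepare_pairwise)."""
    if rng.random() < 0.5:
        return {"response_a": chosen, "response_b": rejected, "gold_label": "A"}
    return {"response_a": rejected, "response_b": chosen, "gold_label": "B"}

rng = random.Random(0)
records = [debias_pair("good answer", "bad answer", rng) for _ in range(1000)]
n_a = sum(1 for r in records if r["gold_label"] == "A")
# n_a should land near 500: gold labels are split roughly evenly between slots
print(n_a)
```

Because the swap is driven by the RNG rather than the data, an always-"A" policy converges to ~50% accuracy, which is what makes the 1.0/0.0 pairwise reward informative.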
server/__init__.py ADDED
@@ -0,0 +1 @@
+ # server package
server/app.py ADDED
@@ -0,0 +1,38 @@
+ """
+ PreferenceLab FastAPI Server.
+
+ Exposes the PreferenceLabEnvironment via the OpenEnv HTTP interface.
+ Supports concurrent sessions for parallel training.
+ """
+
+ import os
+
+ from openenv.core.env_server import create_app
+
+ from models import (
+     ConsistencyAction,
+     ConsistencyObservation,
+     LikertAction,
+     LikertObservation,
+     PairwiseAction,
+     PairwiseObservation,
+ )
+ from server.environment import PreferenceLabEnvironment
+
+ SUPPORTS_CONCURRENT_SESSIONS: bool = True
+ MAX_CONCURRENT_ENVS = int(os.environ.get("MAX_CONCURRENT_ENVS", "64"))
+
+
+ def create_environment() -> PreferenceLabEnvironment:
+     """Factory function — called once per session."""
+     return PreferenceLabEnvironment()
+
+
+ # Use PairwiseAction/PairwiseObservation as the primary schema.
+ # The environment internally handles all three task types.
+ app = create_app(
+     create_environment,
+     PairwiseAction,
+     PairwiseObservation,
+     max_concurrent_envs=MAX_CONCURRENT_ENVS,
+ )
server/environment.py ADDED
@@ -0,0 +1,513 @@
+ """
+ PreferenceLab Core Environment.
+
+ Implements the OpenEnv Environment base class with:
+     - reset() → returns initial observation
+     - step()  → executes action, returns (observation, reward, done, info)
+     - state() → returns episode metadata
+
+ Three tasks:
+     Task 1 (pairwise)    - Easy:   pairwise choice graded against HH-RLHF gold labels
+     Task 2 (likert)      - Medium: multi-axis scoring graded via MAE vs UltraFeedback scores
+     Task 3 (consistency) - Hard:   4-way ranking graded on transitivity + quality correlation
+ """
+
+ import json
+ import random
+ import uuid
+ from pathlib import Path
+ from typing import Any
+
+ from openenv.core.env_server import Environment
+
+ from models import (
+     ConsistencyAction,
+     ConsistencyObservation,
+     LikertAction,
+     LikertObservation,
+     PairwiseAction,
+     PairwiseObservation,
+ )
+
+ # ── Dataset loading ────────────────────────────────────────────
+
+ DATA_DIR = Path(__file__).parent.parent / "data"
+
+ def _load_json(filename: str) -> list[dict]:
+     path = DATA_DIR / filename
+     if path.exists():
+         with open(path) as f:
+             return json.load(f)
+     return []
+
+
+ # ── Graders ───────────────────────────────────────────────────
+
+ def grade_pairwise(action: PairwiseAction, example: dict) -> tuple[float, dict]:
+     """
+     Grade Task 1: Pairwise ranking.
+
+     The gold label ('A' or 'B') marks which slot holds the dataset's chosen response.
+     Returns:
+         1.0 → correct choice
+         0.3 → skip (abstain — partial credit)
+         0.1 → tie (when gold is clear)
+         0.0 → wrong choice
+     """
+     gold = example.get("gold_label", "A")  # 'A' = chosen is response_a
+     choice = action.choice
+
+     if choice == "skip":
+         reward = 0.3
+         verdict = "abstained"
+     elif choice == "tie":
+         reward = 0.1
+         verdict = "tie_when_clear"
+     elif choice == gold:
+         reward = 1.0
+         verdict = "correct"
+     else:
+         reward = 0.0
+         verdict = "incorrect"
+
+     return reward, {
+         "gold": gold,
+         "chosen": choice,
+         "verdict": verdict,
+         "dataset": example.get("source", "hh-rlhf"),
+     }
+
+
+ def grade_likert(action: LikertAction, example: dict) -> tuple[float, dict]:
+     """
+     Grade Task 2: Multi-axis Likert scoring.
+
+     Compares the agent's 4-axis scores to gold scores from UltraFeedback.
+     Reward = 1.0 - (mean_absolute_error / max_possible_error), where the
+     maximum absolute error per axis on a 1-5 scale is 4 (1 vs 5).
+     """
+     gold_scores = example.get("gold_scores", {
+         "helpfulness": 3,
+         "honesty": 3,
+         "harmlessness": 4,
+         "instruction_following": 3,
+     })
+
+     axes = ["helpfulness", "honesty", "harmlessness", "instruction_following"]
+     agent_scores = {
+         "helpfulness": action.helpfulness,
+         "honesty": action.honesty,
+         "harmlessness": action.harmlessness,
+         "instruction_following": action.instruction_following,
+     }
+
+     errors = []
+     per_axis = {}
+     for ax in axes:
+         err = abs(agent_scores[ax] - gold_scores.get(ax, 3))
+         errors.append(err)
+         per_axis[ax] = {"agent": agent_scores[ax], "gold": gold_scores.get(ax, 3), "error": err}
+
+     mae = sum(errors) / len(errors)
+     max_error = 4.0  # max abs difference on a 1-5 scale
+     reward = round(1.0 - (mae / max_error), 4)
+     reward = max(0.0, min(1.0, reward))
+
+     return reward, {
+         "mae": round(mae, 4),
+         "per_axis": per_axis,
+         "dataset": example.get("source", "ultrafeedback"),
+     }
+
+
+ def grade_consistency(action: ConsistencyAction, example: dict) -> tuple[float, dict]:
+     """
+     Grade Task 3: Transitive consistency chain ranking.
+
+     Scoring components:
+         - Transitivity score (0.0 or 0.5): a valid total order over {A, B, C, D}
+           is transitive by construction, so full credit unless the ranking
+           contains duplicates or invalid IDs.
+         - Quality correlation (0.0–0.5): Kendall's tau vs the gold ranking.
+     Total reward = transitivity_score + quality_score (max 1.0).
+     """
+     ranking = action.ranking
+     gold_ranking = example.get("gold_ranking", ["A", "B", "C", "D"])
+
+     # --- Transitivity check ---
+     # A ranking that is a permutation of {A, B, C, D} is a total order and
+     # therefore transitive by construction. Penalise only rankings that
+     # contain duplicates or invalid IDs.
+     valid_ids = {"A", "B", "C", "D"}
+     has_invalid = set(ranking) != valid_ids
+     transitivity_score = 0.0 if has_invalid else 0.5
+
+     # --- Quality correlation (Kendall's tau, simplified) ---
+     if has_invalid:
+         quality_score = 0.0
+         n_concordant = 0
+         n_discordant = 0
+     else:
+         ids = ["A", "B", "C", "D"]
+         agent_pos = {r: i for i, r in enumerate(ranking)}
+         gold_pos = {r: i for i, r in enumerate(gold_ranking)}
+
+         n_concordant = 0
+         n_discordant = 0
+         pairs = [(ids[i], ids[j]) for i in range(4) for j in range(i + 1, 4)]
+         for x, y in pairs:
+             agent_order = agent_pos[x] < agent_pos[y]
+             gold_order = gold_pos[x] < gold_pos[y]
+             if agent_order == gold_order:
+                 n_concordant += 1
+             else:
+                 n_discordant += 1
+
+         total_pairs = n_concordant + n_discordant
+         tau = (n_concordant - n_discordant) / total_pairs if total_pairs > 0 else 0.0
+         # Normalise tau from [-1, 1] to [0, 0.5]
+         quality_score = round((tau + 1.0) / 2.0 * 0.5, 4)
+
+     reward = round(transitivity_score + quality_score, 4)
+     reward = max(0.0, min(1.0, reward))
+
+     return reward, {
+         "transitivity_score": transitivity_score,
+         "quality_score": quality_score,
+         "agent_ranking": ranking,
+         "gold_ranking": gold_ranking,
+         "has_invalid_ids": has_invalid,
+         "dataset": example.get("source", "stanford-shp"),
+     }
+
+
+ # ── Environment ───────────────────────────────────────────────
+
+ TASK_TYPES = ["pairwise", "likert", "consistency"]
+ MAX_STEPS_PER_EPISODE = 5
+
+
+ class PreferenceLabEnvironment(Environment):
+     """
+     PreferenceLab: An RL environment simulating the RLHF preference
+     data collection pipeline.
+
+     Each episode consists of MAX_STEPS_PER_EPISODE annotation steps.
+     The task type is fixed per episode (chosen at reset).
+     """
+
+     def __init__(self):
+         self._episode_id: str = ""
+         self._step_count: int = 0
+         self._task_type: str = "pairwise"
+         self._current_example: dict = {}
+         self._cumulative_reward: float = 0.0
+         self._seed: int = 0
+
+         # Load datasets
+         self._pairwise_data: list[dict] = _load_json("pairwise_data.json")
+         self._likert_data: list[dict] = _load_json("likert_data.json")
+         self._consistency_data: list[dict] = _load_json("consistency_data.json")
+
+         # Fall back to synthetic data if the files are not present
+         if not self._pairwise_data:
+             self._pairwise_data = _synthetic_pairwise()
+         if not self._likert_data:
+             self._likert_data = _synthetic_likert()
+         if not self._consistency_data:
+             self._consistency_data = _synthetic_consistency()
+
+     # ── OpenEnv API ───────────────────────────────────────────
+
+     def reset(self, seed: int | None = None, episode_id: str | None = None, **kwargs):
+         """
+         Reset the environment for a new episode.
+
+         Args:
+             seed: Optional random seed for reproducibility.
+             episode_id: Optional episode ID override.
+             **kwargs: Accepts task_type ('pairwise', 'likert', 'consistency').
+
+         Returns:
+             Initial observation for the episode.
+         """
+         task_type = kwargs.get("task_type", None)
+         self._seed = seed if seed is not None else random.randint(0, 10_000)
+         rng = random.Random(self._seed)
+
+         self._episode_id = episode_id or str(uuid.uuid4())
+         self._step_count = 0
+         self._cumulative_reward = 0.0
+         self._task_type = task_type if task_type in TASK_TYPES else rng.choice(TASK_TYPES)
+
+         self._current_example = self._sample_example(rng)
+
+         return self._build_observation(reward=0.0, done=False, info={"reset": True})
+
+     def step(self, action, timeout_s: float | None = None, **kwargs):
+         """
+         Execute one annotation step.
+
+         Args:
+             action: A PairwiseAction, LikertAction, or ConsistencyAction.
+             timeout_s: Unused — required by the base class signature.
+
+         Returns:
+             Tuple of (observation, reward, done, info).
+         """
+         self._step_count += 1
+
+         # Grade the action
+         reward, info = self._grade(action)
+         self._cumulative_reward += reward
+
+         done = self._step_count >= MAX_STEPS_PER_EPISODE
+
+         # Sample the next example if not done
+         if not done:
+             rng = random.Random(self._seed + self._step_count)
+             self._current_example = self._sample_example(rng)
+
+         obs = self._build_observation(reward=reward, done=done, info=info)
+         return obs, reward, done, info
+
+     def state(self) -> dict[str, Any]:
+         """Return current episode metadata."""
+         return {
+             "episode_id": self._episode_id,
+             "step_count": self._step_count,
+             "task_type": self._task_type,
+             "cumulative_reward": round(self._cumulative_reward, 4),
+             "max_steps": MAX_STEPS_PER_EPISODE,
+             "seed": self._seed,
+         }
+
+     # ── Internal helpers ──────────────────────────────────────
+
+     def _sample_example(self, rng: random.Random) -> dict:
+         """Sample one example from the appropriate dataset."""
+         dataset = {
+             "pairwise": self._pairwise_data,
+             "likert": self._likert_data,
+             "consistency": self._consistency_data,
+         }[self._task_type]
+         return rng.choice(dataset)
+
+     def _grade(self, action) -> tuple[float, dict]:
+         """Dispatch to the correct grader based on task type."""
+         if self._task_type == "pairwise":
+             return grade_pairwise(action, self._current_example)
+         elif self._task_type == "likert":
+             return grade_likert(action, self._current_example)
+         elif self._task_type == "consistency":
+             return grade_consistency(action, self._current_example)
+         return 0.0, {"error": "unknown_task"}
+
+     def _build_observation(self, reward: float, done: bool, info: dict):
+         """Build the appropriate observation type for the current task."""
+         ex = self._current_example
+         base = {
+             "task_id": self._episode_id + f"_step{self._step_count}",
+             "reward": reward,
+             "done": done,
+             "step_count": self._step_count,
+             "info": info,
+         }
+
+         if self._task_type == "pairwise":
+             return PairwiseObservation(
+                 **base,
+                 prompt=ex.get("prompt", ""),
+                 response_a=ex.get("response_a", ""),
+                 response_b=ex.get("response_b", ""),
+             )
+         elif self._task_type == "likert":
+             return LikertObservation(
+                 **base,
+                 prompt=ex.get("prompt", ""),
+                 response=ex.get("response", ""),
+                 rubric=ex.get("rubric", DEFAULT_LIKERT_RUBRIC),
+             )
+         elif self._task_type == "consistency":
+             return ConsistencyObservation(
+                 **base,
+                 prompt=ex.get("prompt", ""),
+                 response_a=ex.get("response_a", ""),
+                 response_b=ex.get("response_b", ""),
+                 response_c=ex.get("response_c", ""),
+                 response_d=ex.get("response_d", ""),
+             )
+
+         raise ValueError(f"Unknown task type: {self._task_type}")
+
+
+ # ── Defaults ──────────────────────────────────────────────────
+
+ DEFAULT_LIKERT_RUBRIC = (
+     "Score the response on 4 axes (1=worst, 5=best): "
+     "Helpfulness — does it answer the question usefully? "
+     "Honesty — is it accurate and truthful? "
+     "Harmlessness — does it avoid harm? "
+     "Instruction Following — does it follow the instruction precisely?"
+ )
+
+
+ # ── Synthetic fallback datasets ───────────────────────────────
+
+ def _synthetic_pairwise() -> list[dict]:
+     return [
+         {
+             "prompt": "Explain what machine learning is in simple terms.",
+             "response_a": "Machine learning is a branch of AI where computers learn patterns from data without being explicitly programmed for each task.",
+             "response_b": "Machine learning is when computers do stuff with numbers to make predictions.",
+             "gold_label": "A",
+             "source": "synthetic",
+         },
+         {
+             "prompt": "How do I improve my sleep quality?",
+             "response_a": "Try sleeping more.",
+             "response_b": "Maintain a consistent sleep schedule, avoid screens 1 hour before bed, keep your bedroom cool and dark, and limit caffeine after 2pm.",
+             "gold_label": "B",
+             "source": "synthetic",
+         },
+         {
+             "prompt": "What is the capital of France?",
+             "response_a": "Paris is the capital and largest city of France.",
+             "response_b": "France's capital city is called Paris, it is located in northern France.",
+             "gold_label": "A",
+             "source": "synthetic",
+         },
+         {
+             "prompt": "Write a haiku about autumn.",
+             "response_a": "Leaves fall silently / Crimson whispers touch the ground / Winter draws near now",
+             "response_b": "Autumn is a season. Leaves fall down. It gets cold outside.",
+             "gold_label": "A",
+             "source": "synthetic",
+         },
+         {
+             "prompt": "How do I reverse a list in Python?",
+             "response_a": "Use my_list.reverse() to reverse in-place, or my_list[::-1] to get a reversed copy.",
+             "response_b": "You can just use the reverse function.",
+             "gold_label": "A",
+             "source": "synthetic",
+         },
+         {
+             "prompt": "Is it safe to eat raw eggs?",
+             "response_a": "Raw eggs carry a risk of Salmonella contamination. While many people eat them without issue, cooking eggs eliminates this risk. Use pasteurised eggs if you want them raw.",
+             "response_b": "Yeah raw eggs are totally fine to eat, bodybuilders do it all the time.",
+             "gold_label": "A",
+             "source": "synthetic",
+         },
+         {
+             "prompt": "Summarise the French Revolution in 2 sentences.",
+             "response_a": "The French Revolution (1789–1799) was a period of radical political and social transformation in France that overthrew the monarchy, established a republic, and culminated in Napoleon's rise to power. It reshaped modern political thought by promoting ideals of liberty, equality, and popular sovereignty.",
+             "response_b": "The French Revolution happened in France. People revolted against the king.",
+             "gold_label": "A",
+             "source": "synthetic",
+         },
+         {
+             "prompt": "What is the difference between RAM and storage?",
+             "response_a": "RAM is temporary memory your computer uses to run active programs — it's fast but clears when you shut down. Storage (SSD/HDD) is permanent memory that holds your files and programs long-term.",
+             "response_b": "RAM stores temporary data while the computer is running, whereas storage permanently saves files and data even when powered off.",
+             "gold_label": "A",
+             "source": "synthetic",
+         },
+     ]
+
+
+ def _synthetic_likert() -> list[dict]:
+     return [
+         {
+             "prompt": "Explain photosynthesis to a 10-year-old.",
+             "response": "Photosynthesis is how plants make food! They use sunlight, water from the soil, and carbon dioxide from the air. Inside their leaves, they turn all this into glucose (their food) and release oxygen — which is the air we breathe!",
+             "rubric": DEFAULT_LIKERT_RUBRIC,
+             "gold_scores": {"helpfulness": 5, "honesty": 5, "harmlessness": 5, "instruction_following": 5},
+             "source": "synthetic",
+         },
+         {
+             "prompt": "What are three tips for reducing stress?",
+             "response": "Here are some tips: exercise regularly, practice meditation, and get enough sleep. These have been shown to reduce cortisol levels and improve mood.",
+             "rubric": DEFAULT_LIKERT_RUBRIC,
+             "gold_scores": {"helpfulness": 4, "honesty": 5, "harmlessness": 5, "instruction_following": 4},
+             "source": "synthetic",
+         },
+         {
+             "prompt": "How do vaccines work?",
+             "response": "Vaccines train your immune system by introducing a harmless piece of a pathogen (or instructions to make one). Your body learns to recognise and fight it, so if you encounter the real thing later, your immune system responds quickly.",
+             "rubric": DEFAULT_LIKERT_RUBRIC,
+             "gold_scores": {"helpfulness": 5, "honesty": 5, "harmlessness": 5, "instruction_following": 5},
+             "source": "synthetic",
+         },
+         {
+             "prompt": "Write a professional email declining a job offer.",
+             "response": "Subject: Re: Job Offer — [Position Name]\n\nDear [Hiring Manager],\n\nThank you so much for offering me the [Position] role at [Company]. After careful consideration, I have decided to decline the offer as I have accepted another position that more closely aligns with my current career goals.\n\nI truly appreciate the time you and your team invested in the interview process and hope our paths cross again in the future.\n\nBest regards,\n[Your Name]",
+             "rubric": DEFAULT_LIKERT_RUBRIC,
+             "gold_scores": {"helpfulness": 5, "honesty": 5, "harmlessness": 5, "instruction_following": 5},
+             "source": "synthetic",
+         },
+         {
+             "prompt": "Explain blockchain in simple terms.",
+             "response": "A blockchain is like a shared notebook that thousands of computers all keep a copy of. Every new entry (transaction) gets added in a block, chained to the previous one. Because everyone has a copy, no single person can secretly change it.",
+             "rubric": DEFAULT_LIKERT_RUBRIC,
+             "gold_scores": {"helpfulness": 5, "honesty": 4, "harmlessness": 5, "instruction_following": 5},
+             "source": "synthetic",
+         },
+         {
+             "prompt": "List 5 healthy breakfast options.",
+             "response": "1. Oatmeal with berries and nuts\n2. Greek yogurt with honey and banana\n3. Avocado toast with eggs\n4. Smoothie with spinach, protein powder, and almond milk\n5. Whole grain cereal with low-fat milk",
+             "rubric": DEFAULT_LIKERT_RUBRIC,
+             "gold_scores": {"helpfulness": 5, "honesty": 5, "harmlessness": 5, "instruction_following": 5},
+             "source": "synthetic",
+         },
+     ]
+
+
+ def _synthetic_consistency() -> list[dict]:
+     return [
+         {
+             "prompt": "Explain how to use Python decorators.",
+             "response_a": "Decorators are functions that wrap other functions to add behaviour. Use @decorator_name above a function definition. Example: @staticmethod, @property, or custom ones with functools.wraps.",
+             "response_b": "Decorators wrap functions.",
+             "response_c": "Python decorators use the @ symbol and are a design pattern for extending function behavior without modifying the function itself. They take a function as input and return a modified version.",
+             "response_d": "You put @ before a function name.",
+             "gold_ranking": ["C", "A", "B", "D"],
+             "source": "synthetic",
+         },
+         {
+             "prompt": "What causes climate change?",
+             "response_a": "Climate change is primarily caused by human activities that release greenhouse gases — mainly CO2 from burning fossil fuels, methane from agriculture and landfills, and N2O from fertilisers. These gases trap heat in the atmosphere.",
+             "response_b": "The sun causes climate change.",
+             "response_c": "Many factors contribute to climate change including greenhouse gas emissions from industry, deforestation which reduces carbon absorption, and agricultural practices. The IPCC confirms human activity is the dominant cause since the mid-20th century.",
+             "response_d": "Climate change happens because of pollution.",
+             "gold_ranking": ["C", "A", "D", "B"],
+             "source": "synthetic",
+         },
+         {
+             "prompt": "How does the internet work?",
+             "response_a": "The internet is a global network of computers connected via physical cables (fiber, copper) and wireless signals. Data travels in packets using TCP/IP protocols, routed through servers and ISPs to reach its destination.",
+             "response_b": "Computers connect together and send data.",
+             "response_c": "Internet works through IP addresses.",
+             "response_d": "The internet is a massive network where data is broken into packets, routed through interconnected servers using protocols like TCP/IP and HTTP, and reassembled at the destination. DNS translates domain names to IP addresses.",
+             "gold_ranking": ["D", "A", "C", "B"],
+             "source": "synthetic",
+         },
+         {
+             "prompt": "Describe the water cycle.",
+             "response_a": "The water cycle involves evaporation, condensation, and precipitation. Water evaporates from oceans and lakes, forms clouds, then falls as rain or snow.",
+             "response_b": "Water goes up and comes down.",
+             "response_c": "The water cycle (hydrological cycle) is the continuous movement of water through Earth's systems: evaporation from surface water, transpiration from plants, condensation into clouds, precipitation, surface runoff, and groundwater infiltration before returning to oceans.",
+             "response_d": "Water evaporates and rains.",
+             "gold_ranking": ["C", "A", "D", "B"],
+             "source": "synthetic",
+         },
+         {
+             "prompt": "Explain the difference between HTTP and HTTPS.",
+             "response_a": "HTTPS is like HTTP but secure.",
+             "response_b": "HTTP is the protocol for transferring web data. HTTPS adds SSL/TLS encryption, meaning data is encrypted in transit. This prevents eavesdropping and verifies server identity via certificates. Always use HTTPS for sensitive data.",
+             "response_c": "HTTP transfers web pages. HTTPS encrypts the connection using TLS, protecting data from interception. The S stands for Secure.",
+             "response_d": "HTTPS has a padlock icon in browsers.",
+             "gold_ranking": ["B", "C", "A", "D"],
+             "source": "synthetic",
+         },
+     ]
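The two non-trivial reward formulas in the graders above (the Likert MAE mapping and the consistency score built from validity plus Kendall's tau) can be checked in isolation. This is a minimal re-derivation over plain dicts and lists; the function names here are mine, and the environment's actual graders operate on the Pydantic action models:

```python
def likert_reward(agent: dict, gold: dict) -> float:
    """MAE over the four 1-5 axes, mapped to [0, 1] (max per-axis error is 4)."""
    errors = [abs(agent[k] - gold[k]) for k in gold]
    mae = sum(errors) / len(errors)
    return round(1.0 - mae / 4.0, 4)

def consistency_reward(ranking: list, gold: list) -> float:
    """0.5 for a valid permutation, plus up to 0.5 from Kendall's tau vs gold."""
    if len(ranking) != len(gold) or set(ranking) != set(gold):
        return 0.0  # duplicates or invalid IDs forfeit both components
    pos = {r: i for i, r in enumerate(ranking)}
    gpos = {r: i for i, r in enumerate(gold)}
    items = sorted(gold)
    pairs = [(items[i], items[j])
             for i in range(len(items)) for j in range(i + 1, len(items))]
    conc = sum(1 for x, y in pairs if (pos[x] < pos[y]) == (gpos[x] < gpos[y]))
    tau = (conc - (len(pairs) - conc)) / len(pairs)
    return round(0.5 + (tau + 1.0) / 2.0 * 0.5, 4)

gold = {"helpfulness": 5, "honesty": 5, "harmlessness": 5, "instruction_following": 5}
agent = {"helpfulness": 4, "honesty": 5, "harmlessness": 3, "instruction_following": 5}
print(likert_reward(agent, gold))  # errors 1,0,2,0 → MAE 0.75 → 0.8125
print(consistency_reward(["B", "A", "C", "D"], ["A", "B", "C", "D"]))  # 5 of 6 pairs agree → 0.9167
```

Note the floor effects this scheme creates: any valid 4-way permutation earns at least 0.5 (a fully reversed ranking gets exactly 0.5, since tau = -1 maps to a quality term of 0), so the learnable signal lives entirely in the 0.5–1.0 band.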
tests/__init__.py ADDED
File without changes
tests/test_environment.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ """
+ Tests for PreferenceLab environment.
+ Run: pytest tests/ -v
+ """
+
+ import sys
+ import os
+ sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
+
+ import pytest
+ from models import (
+     PairwiseAction, LikertAction, ConsistencyAction,
+     PairwiseObservation, LikertObservation, ConsistencyObservation,
+ )
+ from server.environment import (
+     PreferenceLabEnvironment,
+     grade_pairwise, grade_likert, grade_consistency,
+ )
+
+
+ # ── Grader unit tests ─────────────────────────────────────────
+
+ class TestPairwiseGrader:
+     def test_correct_choice_scores_1(self):
+         action = PairwiseAction(choice="A")
+         example = {"gold_label": "A", "source": "test"}
+         reward, info = grade_pairwise(action, example)
+         assert reward == 1.0
+         assert info["verdict"] == "correct"
+
+     def test_wrong_choice_scores_0(self):
+         action = PairwiseAction(choice="B")
+         example = {"gold_label": "A", "source": "test"}
+         reward, info = grade_pairwise(action, example)
+         assert reward == 0.0
+         assert info["verdict"] == "incorrect"
+
+     def test_skip_scores_partial(self):
+         action = PairwiseAction(choice="skip")
+         example = {"gold_label": "A", "source": "test"}
+         reward, info = grade_pairwise(action, example)
+         assert reward == 0.3
+
+     def test_tie_scores_low(self):
+         action = PairwiseAction(choice="tie")
+         example = {"gold_label": "A", "source": "test"}
+         reward, info = grade_pairwise(action, example)
+         assert reward == 0.1
+
+     def test_reward_in_range(self):
+         for choice in ["A", "B", "tie", "skip"]:
+             action = PairwiseAction(choice=choice)
+             reward, _ = grade_pairwise(action, {"gold_label": "A", "source": "test"})
+             assert 0.0 <= reward <= 1.0
+
+
+ class TestLikertGrader:
+     def test_perfect_scores_reward_1(self):
+         action = LikertAction(helpfulness=5, honesty=5, harmlessness=5, instruction_following=5)
+         example = {
+             "gold_scores": {"helpfulness": 5, "honesty": 5, "harmlessness": 5, "instruction_following": 5},
+             "source": "test",
+         }
+         reward, info = grade_likert(action, example)
+         assert reward == 1.0
+
+     def test_worst_scores_reward_0(self):
+         action = LikertAction(helpfulness=1, honesty=1, harmlessness=1, instruction_following=1)
+         example = {
+             "gold_scores": {"helpfulness": 5, "honesty": 5, "harmlessness": 5, "instruction_following": 5},
+             "source": "test",
+         }
+         reward, info = grade_likert(action, example)
+         assert reward == 0.0
+
+     def test_partial_error_gives_partial_reward(self):
+         action = LikertAction(helpfulness=4, honesty=4, harmlessness=4, instruction_following=4)
+         example = {
+             "gold_scores": {"helpfulness": 5, "honesty": 5, "harmlessness": 5, "instruction_following": 5},
+             "source": "test",
+         }
+         reward, info = grade_likert(action, example)
+         assert 0.0 < reward < 1.0
+
+     def test_reward_always_in_range(self):
+         import random
+         for _ in range(20):
+             action = LikertAction(
+                 helpfulness=random.randint(1, 5),
+                 honesty=random.randint(1, 5),
+                 harmlessness=random.randint(1, 5),
+                 instruction_following=random.randint(1, 5),
+             )
+             example = {
+                 "gold_scores": {
+                     "helpfulness": random.randint(1, 5),
+                     "honesty": random.randint(1, 5),
+                     "harmlessness": random.randint(1, 5),
+                     "instruction_following": random.randint(1, 5),
+                 }
+             }
+             reward, _ = grade_likert(action, example)
+             assert 0.0 <= reward <= 1.0, f"Reward out of range: {reward}"
+
+
+ class TestConsistencyGrader:
+     def test_perfect_ranking_scores_1(self):
+         action = ConsistencyAction(ranking=["A", "B", "C", "D"])
+         example = {"gold_ranking": ["A", "B", "C", "D"], "source": "test"}
+         reward, info = grade_consistency(action, example)
+         assert reward == 1.0
+
+     def test_reversed_ranking_scores_low(self):
+         action = ConsistencyAction(ranking=["D", "C", "B", "A"])
+         example = {"gold_ranking": ["A", "B", "C", "D"], "source": "test"}
+         reward, info = grade_consistency(action, example)
+         # Transitivity score = 0.5 (ranking is still a valid total order)
+         # Quality score = 0.0 (worst possible Kendall tau = -1 → normalized to 0)
+         # Total = 0.5, strictly less than the perfect score of 1.0
+         assert reward < 1.0
+         assert info["quality_score"] == 0.0
+
+     def test_invalid_ids_scores_low(self):
+         action = ConsistencyAction(ranking=["A", "B", "C", "X"])
+         example = {"gold_ranking": ["A", "B", "C", "D"], "source": "test"}
+         reward, info = grade_consistency(action, example)
+         assert reward == 0.0
+         assert info["has_invalid_ids"] is True
+
+     def test_reward_always_in_range(self):
+         import itertools
+         ids = ["A", "B", "C", "D"]
+         gold = ["A", "B", "C", "D"]
+         for perm in itertools.permutations(ids):
+             action = ConsistencyAction(ranking=list(perm))
+             example = {"gold_ranking": gold, "source": "test"}
+             reward, _ = grade_consistency(action, example)
+             assert 0.0 <= reward <= 1.0, f"Reward out of range: {reward} for {perm}"
+
+     def test_graders_not_always_same_score(self):
+         """Critical: graders must NOT always return the same score."""
+         action_correct = ConsistencyAction(ranking=["A", "B", "C", "D"])
+         action_wrong = ConsistencyAction(ranking=["D", "C", "B", "A"])
+         example = {"gold_ranking": ["A", "B", "C", "D"], "source": "test"}
+         r1, _ = grade_consistency(action_correct, example)
+         r2, _ = grade_consistency(action_wrong, example)
+         assert r1 != r2, "Grader must return different scores for different inputs!"
+
+
+ # ── Environment integration tests ─────────────────────────────
+
+ class TestPreferenceLabEnvironment:
+     def setup_method(self):
+         self.env = PreferenceLabEnvironment()
+
+     def test_reset_returns_observation(self):
+         obs = self.env.reset()
+         assert obs is not None
+         assert hasattr(obs, "prompt")
+         assert hasattr(obs, "reward")
+         assert hasattr(obs, "done")
+
+     def test_reset_pairwise_returns_pairwise_obs(self):
+         obs = self.env.reset(task_type="pairwise")
+         assert isinstance(obs, PairwiseObservation)
+         assert obs.response_a != ""
+         assert obs.response_b != ""
+
+     def test_reset_likert_returns_likert_obs(self):
+         obs = self.env.reset(task_type="likert")
+         assert isinstance(obs, LikertObservation)
+         assert obs.response != ""
+         assert obs.rubric != ""
+
+     def test_reset_consistency_returns_consistency_obs(self):
+         obs = self.env.reset(task_type="consistency")
+         assert isinstance(obs, ConsistencyObservation)
+         assert obs.response_a != ""
+         assert obs.response_d != ""
+
+     def test_step_pairwise(self):
+         self.env.reset(task_type="pairwise")
+         action = PairwiseAction(choice="A")
+         obs, reward, done, info = self.env.step(action)
+         assert isinstance(obs, PairwiseObservation)
+         assert 0.0 <= reward <= 1.0
+         assert isinstance(done, bool)
+
+     def test_step_likert(self):
+         self.env.reset(task_type="likert")
+         action = LikertAction(helpfulness=4, honesty=4, harmlessness=5, instruction_following=4)
+         obs, reward, done, info = self.env.step(action)
+         assert isinstance(obs, LikertObservation)
+         assert 0.0 <= reward <= 1.0
+
+     def test_step_consistency(self):
+         self.env.reset(task_type="consistency")
+         action = ConsistencyAction(ranking=["A", "B", "C", "D"])
+         obs, reward, done, info = self.env.step(action)
+         assert isinstance(obs, ConsistencyObservation)
+         assert 0.0 <= reward <= 1.0
+
+     def test_episode_terminates_after_max_steps(self):
+         self.env.reset(task_type="pairwise")
+         done = False
+         steps = 0
+         # Guard against an infinite loop if the environment never terminates.
+         while not done and steps < 50:
+             _, _, done, _ = self.env.step(PairwiseAction(choice="A"))
+             steps += 1
+         assert steps <= 10, "Episode ran too long!"
+         assert done is True
+
+     def test_state_returns_metadata(self):
+         self.env.reset(seed=42, task_type="pairwise")
+         state = self.env.state()
+         assert "episode_id" in state
+         assert "step_count" in state
+         assert "task_type" in state
+         assert state["seed"] == 42
+
+     def test_reproducible_with_seed(self):
+         obs1 = self.env.reset(seed=123, task_type="pairwise")
+         obs2 = self.env.reset(seed=123, task_type="pairwise")
+         assert obs1.prompt == obs2.prompt
+         assert obs1.response_a == obs2.response_a
+
+     def test_rewards_vary_across_actions(self):
+         """Ensure graders do NOT always return the same reward (disqualification check)."""
+         rewards = set()
+         for _ in range(5):
+             self.env.reset(task_type="pairwise")
+             action_a = PairwiseAction(choice="A")
+             _, r1, _, _ = self.env.step(action_a)
+             self.env.reset(task_type="pairwise")
+             action_b = PairwiseAction(choice="B")
+             _, r2, _, _ = self.env.step(action_b)
+             rewards.add(r1)
+             rewards.add(r2)
+         assert len(rewards) > 1, "Grader always returns the same score: disqualification!"
+
+     def test_all_three_tasks_run(self):
+         for task in ["pairwise", "likert", "consistency"]:
+             obs = self.env.reset(task_type=task)
+             assert obs is not None
+             state = self.env.state()
+             assert state["task_type"] == task
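The Likert tests pin down the grader's endpoints: an exact match on all four axes earns 1.0, maximal disagreement earns 0.0, and small errors land strictly in between. A minimal scheme satisfying those constraints is mean absolute error normalized by the 4-point span of a 1-5 scale; `grade_likert_sketch` below is a hypothetical helper for illustration, not the actual `grade_likert` in `server/environment.py`.

```python
DIMS = ["helpfulness", "honesty", "harmlessness", "instruction_following"]

def grade_likert_sketch(pred: dict, gold: dict) -> float:
    """Reward = 1 - (mean absolute error / max possible error per axis)."""
    mae = sum(abs(pred[d] - gold[d]) for d in DIMS) / len(DIMS)
    return 1.0 - mae / 4.0  # 4 = worst-case gap on a 1-5 scale

gold = {d: 5 for d in DIMS}
print(grade_likert_sketch({d: 4 for d in DIMS}, gold))  # 0.75
```

Any grader of this shape automatically passes the range tests as well, since the per-axis error is bounded by the scale itself.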