inference.py (ADDED, +302 -0)

"""
inference.py - Baseline inference script for CodeReview-Env.

Runs an LLM agent through all 3 tasks and logs results in the
mandatory [START] / [STEP] / [END] format required by OpenEnv evaluators.

Environment variables required:
    API_BASE_URL - LLM API base URL (OpenAI-compatible)
    MODEL_NAME   - model identifier (e.g. gpt-4o-mini)
    HF_TOKEN     - Hugging Face / API key (falls back to OPENAI_API_KEY)
    SPACE_URL    - URL of the deployed HF Space (e.g. https://my-space.hf.space);
                   defaults to http://localhost:7860
"""

import json
import os
import sys
import time
from typing import Any, Dict, List, Optional

import httpx
from openai import OpenAI


# ─────────────────────────────────────────────────────────────
# Config
# ─────────────────────────────────────────────────────────────
API_BASE_URL: str = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
MODEL_NAME: str = os.environ.get("MODEL_NAME", "gpt-4o-mini")
API_KEY: str = os.environ.get("HF_TOKEN", os.environ.get("OPENAI_API_KEY", ""))
SPACE_URL: str = os.environ.get("SPACE_URL", "http://localhost:7860").rstrip("/")

BENCHMARK = "CodeReview-Env"
MAX_TOKENS = 1024
SUCCESS_SCORE_THRESHOLD = 0.6

TASKS = ["easy_syntax", "medium_logic", "hard_security"]
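
# Typical invocation, per the docstring above (illustrative values):
#   API_BASE_URL=https://api.openai.com/v1 MODEL_NAME=gpt-4o-mini \
#   HF_TOKEN=... SPACE_URL=https://my-space.hf.space python inference.py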


# ─────────────────────────────────────────────────────────────
# Structured stdout logging (MANDATORY format)
# ─────────────────────────────────────────────────────────────

def log_start(task: str, env: str, model: str) -> None:
    print(
        json.dumps({"type": "START", "task": task, "env": env, "model": model}),
        flush=True,
    )


def log_step(
    step: int,
    action: Any,
    reward: float,
    done: bool,
    error: Optional[str] = None,
) -> None:
    print(
        json.dumps(
            {
                "type": "STEP",
                "step": step,
                "action": str(action)[:300],  # truncate for readability
                "reward": reward,
                "done": done,
                "error": error,
            }
        ),
        flush=True,
    )


def log_end(
    success: bool, steps: int, score: float, rewards: List[float]
) -> None:
    print(
        json.dumps(
            {
                "type": "END",
                "success": success,
                "steps": steps,
                "score": score,
                "rewards": rewards,
            }
        ),
        flush=True,
    )


# ─────────────────────────────────────────────────────────────
# Environment HTTP client (thin wrapper around the HF Space API)
# ─────────────────────────────────────────────────────────────

class CodeReviewEnvClient:
    def __init__(self, base_url: str) -> None:
        self.base_url = base_url
        self.client = httpx.Client(timeout=60.0)

    def reset(self, task_id: str) -> Dict:
        r = self.client.post(f"{self.base_url}/reset", params={"task_id": task_id})
        r.raise_for_status()
        return r.json()

    def step(self, action_payload: Dict) -> Dict:
        r = self.client.post(f"{self.base_url}/step", json=action_payload)
        r.raise_for_status()
        return r.json()

    def state(self) -> Dict:
        r = self.client.get(f"{self.base_url}/state")
        r.raise_for_status()
        return r.json()

    def close(self) -> None:
        self.client.close()
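
# Response shapes, as consumed by run_task() below (field names inferred
# from usage in this script, not from the Space's API docs):
#   POST /reset -> {"observation": {...}}
#   POST /step  -> {"observation": {...}, "reward": float, "done": bool,
#                   "info": {"feedback": str or null}}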


# ─────────────────────────────────────────────────────────────
# Agent: LLM-powered code reviewer
# ─────────────────────────────────────────────────────────────

SYSTEM_PROMPT = """\
You are an expert software engineer specialising in code review, debugging, \
and security auditing. You will be shown a code snippet along with a task \
description. Your job is to:

1. Carefully analyse the code.
2. Identify ALL bugs, logic errors, and security vulnerabilities.
3. Return a structured JSON action in EXACTLY the following format:

{
  "identified_issues": [
    {
      "line_number": <int or null>,
      "issue_type": "<syntax_error|logic_bug|security_vulnerability|performance|style>",
      "description": "<clear description of the issue>",
      "severity": "<low|medium|high|critical>"
    }
  ],
  "suggested_fix": "<complete corrected code as a string, or null>",
  "explanation": "<brief explanation of all findings>",
  "done": true
}

Output ONLY the JSON object - no prose, no markdown fences.
"""
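
# A concrete action matching the schema above (illustrative values):
#   {"identified_issues": [{"line_number": 3, "issue_type": "logic_bug",
#     "description": "loop index is never incremented", "severity": "high"}],
#    "suggested_fix": "<corrected code>", "explanation": "Infinite loop", "done": true}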


def build_user_message(obs: Dict, step: int, prev_feedback: Optional[str]) -> str:
    parts = [
        f"Task: {obs['task_name']} ({obs['difficulty']})",
        f"Language: {obs['language']}",
        f"Context: {obs['context']}",
        "",
        "Code to review:",
        "```",
        obs["code_snippet"],
        "```",
        f"(Step {step}/{obs['max_steps']})",
    ]
    if prev_feedback:
        parts += ["", "Previous grader feedback:", prev_feedback]
    return "\n".join(parts)


def call_llm(llm_client: OpenAI, user_message: str) -> str:
    try:
        completion = llm_client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_message},
            ],
            max_tokens=MAX_TOKENS,
            temperature=0.2,
        )
        return (completion.choices[0].message.content or "{}").strip()
    except Exception as exc:
        print(f"[DEBUG] LLM call failed: {exc}", flush=True)
        # Fall back to a minimal no-op action so the episode can still terminate.
        return json.dumps({
            "identified_issues": [],
            "suggested_fix": None,
            "explanation": "LLM call failed",
            "done": True,
        })


def parse_action(raw: str) -> Dict:
    """Parse LLM output into an action dict. Tolerates minor formatting issues."""
    raw = raw.strip()
    # Strip markdown code fences if present.
    if raw.startswith("```"):
        raw = raw.split("```")[1]
        if raw.startswith("json"):
            raw = raw[4:]
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Unparseable output: surface it as an explanation-only action.
        return {
            "identified_issues": [],
            "suggested_fix": None,
            "explanation": raw[:500],
            "done": True,
        }
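
# e.g. parse_action('```json\n{"done": true}\n```') == {"done": True}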


# ─────────────────────────────────────────────────────────────
# Main: run agent on all tasks
# ─────────────────────────────────────────────────────────────

def run_task(
    task_id: str,
    env_client: CodeReviewEnvClient,
    llm_client: OpenAI,
) -> float:
    """Run one full episode and return the episode score in [0, 1]."""
    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    result = env_client.reset(task_id=task_id)
    obs = result["observation"]

    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    max_steps = obs["max_steps"]

    try:
        prev_feedback: Optional[str] = None

        for step in range(1, max_steps + 1):
            user_msg = build_user_message(obs, step, prev_feedback)
            raw_action = call_llm(llm_client, user_msg)
            action_dict = parse_action(raw_action)

            step_result = env_client.step(action_dict)

            reward = float(step_result.get("reward", 0.0))
            done = bool(step_result.get("done", False))
            info = step_result.get("info", {})
            prev_feedback = info.get("feedback")

            rewards.append(reward)
            steps_taken = step

            log_step(step=step, action=action_dict.get("explanation", ""), reward=reward, done=done)

            obs = step_result["observation"]

            if done:
                break

        # Score = best single-step reward (the agent submits a full review each step).
        score = max(rewards) if rewards else 0.0
        score = min(max(score, 0.0), 1.0)
        success = score >= SUCCESS_SCORE_THRESHOLD

    finally:
        # Emitted even on an unexpected exception, so evaluators always see an END record.
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return score
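
# Scoring example: rewards [0.2, 0.7, 0.5] -> score 0.7; success, since 0.7 >= 0.6.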


def main() -> None:
    llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    env_client = CodeReviewEnvClient(SPACE_URL)

    # Wait for the server to become ready (useful right after a docker start).
    for attempt in range(10):
        try:
            env_client.client.get(f"{SPACE_URL}/health").raise_for_status()
            break
        except Exception:
            print(f"[DEBUG] Waiting for server... attempt {attempt + 1}/10", flush=True)
            time.sleep(3)
    else:
        print("[ERROR] Server did not become ready. Exiting.", flush=True)
        sys.exit(1)

    task_scores: Dict[str, float] = {}
    for task_id in TASKS:
        print(f"\n{'=' * 60}", flush=True)
        print(f"Running task: {task_id}", flush=True)
        print("=" * 60, flush=True)
        task_scores[task_id] = run_task(task_id, env_client, llm_client)
        time.sleep(1)

    env_client.close()

    # Summary
    print("\n" + "=" * 60, flush=True)
    print("FINAL SCORES", flush=True)
    print("=" * 60, flush=True)
    for task_id, s in task_scores.items():
        status = "✅ PASS" if s >= SUCCESS_SCORE_THRESHOLD else "❌ FAIL"
        print(f"  {task_id:25s}: {s:.4f} {status}", flush=True)
    overall = sum(task_scores.values()) / len(task_scores)
    print(f"\n  Overall average: {overall:.4f}", flush=True)


if __name__ == "__main__":
    main()