Spaces:

vishaldhakad
/

SecureCodeEnv

Sleeping

App Files Files Community

vishaldhakad committed on 15 days ago

Commit

e301abd

1 Parent(s): e7d44a8

fix: add pyproject.toml for openenv validate

Browse files

Files changed (1) hide show

inference.py +82 -143

inference.py CHANGED Viewed

@@ -1,15 +1,6 @@
 """
 SecureCodeEnv - Baseline Inference Script
 Required by hackathon. Runs an LLM agent through the environment.
-Usage:
-    export API_BASE_URL=https://api.openai.com/v1
-    export MODEL_NAME=gpt-4o-mini
-    export HF_TOKEN=hf_your_token
-    export ENV_URL=http://localhost:7860  # or your HF Space URL
-    python inference.py
-Completes in under 20 minutes on 2 vCPU / 8GB RAM.
 """
 import os
 import json
@@ -17,19 +8,24 @@ import time
 import sys
 import requests
 from openai import OpenAI
-# ── Required environment variables ──────────────────────────────────────────
 API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
-MODEL_NAME   = os.environ.get("MODEL_NAME", "gpt-4o-mini")
-HF_TOKEN     = os.environ.get("HF_TOKEN", "")
-ENV_URL      = os.environ.get("ENV_URL", "http://localhost:7860")
 if not HF_TOKEN:
     print("⚠️  HF_TOKEN not set. Some model endpoints may reject requests.", file=sys.stderr)
 client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN or "sk-placeholder")
-# ── System prompt ─────────────────────────────────────────────────────────
 SYSTEM_PROMPT = """You are a senior Python security engineer.
 You write production-ready, secure Python code with no shortcuts.
@@ -42,17 +38,24 @@ Rules:
 6. Use hmac.compare_digest for secret comparison (not ==).
 7. Validate all inputs — handle None, empty string, type errors.
 8. Add type hints and docstrings to every function.
-9. Follow the naming and style conventions shown in CODEBASE CONTEXT.
-10. Use pathlib.Path.resolve() for file path validation (not string checks)."""
-def run_episode(difficulty: str = "medium") -> dict:
     """Run one full episode at the given difficulty and return results."""
-    print(f"\n{'='*60}")
-    print(f"  Episode: {difficulty.upper()}")
-    print(f"{'='*60}")
-    # ── Step 1: Reset environment ─────────────────────────────────────────
     try:
         reset_resp = requests.post(
             f"{ENV_URL}/reset",
@@ -60,178 +63,114 @@ def run_episode(difficulty: str = "medium") -> dict:
             timeout=30,
         )
         reset_resp.raise_for_status()
-    except requests.RequestException as e:
         print(f"❌ /reset failed: {e}")
-        return {"task": "unknown", "scores": [], "final_score": 0.0, "improved": False, "error": str(e)}
     episode = reset_resp.json()
     sid = episode["session_id"]
     task_id = episode["task_id"]
-    print(f"  Task: {task_id}")
-    print(f"  CWE targets: {episode.get('cwe_targets', [])}")
-    scores_history = []
-    prev_feedback = {}
     for step_num in range(5):
-        # ── Step 2: Build prompt ──────────────────────────────────────────
         context = episode.get("codegraph", {})
         context_prompt = context.get("context_prompt", "")
-        # Cap context at 3000 chars to stay within token budget
-        context_str = context_prompt[:3000] if context_prompt else json.dumps(context, indent=2)[:2000]
-        feedback_str = ""
-        if prev_feedback:
-            feedback_str = "\n\nPREVIOUS ATTEMPT FEEDBACK:\n" + "\n".join(
-                f"  {k}: {v}" for k, v in prev_feedback.items() if v
-            )
-        user_message = f"""Task: {episode['problem_statement']}
-Security targets: {episode.get('cwe_targets', [])}
-{context_str}
-{feedback_str}
-Write the complete Python implementation now:"""
-        messages = [
-            {"role": "system", "content": SYSTEM_PROMPT},
-            {"role": "user", "content": user_message},
-        ]
-        # ── Step 3: Call LLM ──────────────────────────────────────────────
         try:
             response = client.chat.completions.create(
                 model=MODEL_NAME,
-                messages=messages,
                 max_tokens=1500,
-                temperature=0.1,  # Low temperature for consistent, focused code
             )
-            code = response.choices[0].message.content.strip()
-            # Strip markdown fences if model added them anyway
-            if code.startswith("```python"):
-                code = code[9:]
-            if code.startswith("```"):
-                code = code[3:]
-            if code.endswith("```"):
-                code = code[:-3]
-            code = code.strip()
-        except Exception as e:
-            print(f"  ⚠️  LLM call failed at step {step_num+1}: {e}")
-            break
-        # ── Step 4: Submit to environment ─────────────────────────────────
-        try:
             step_resp = requests.post(
                 f"{ENV_URL}/step",
                 json={
                     "session_id": sid,
                     "code": code,
-                    "filename": f"solution_step{step_num}.py",
                     "task_id": task_id,
                 },
-                timeout=60,  # Grading can take up to 60s (bandit + attacks)
             )
             step_resp.raise_for_status()
-        except requests.RequestException as e:
-            print(f"  ⚠️  /step failed: {e}")
-            break
-        result = step_resp.json()
-        reward = result["total_reward"]
-        scores_history.append(reward)
-        prev_feedback = result.get("feedback", {})
-        # Pretty print step result
-        scores = result.get("scores", {})
-        print(f"\n  Step {step_num+1} → reward={reward:.3f}")
-        print(f"    correctness={scores.get('correctness',0):.2f}  "
-              f"attack={scores.get('attack_resist',0):.2f}  "
-              f"static={scores.get('static_security',0):.2f}  "
-              f"consistency={scores.get('consistency',0):.2f}")
-        print(f"    summary: {prev_feedback.get('summary', '')}")
-        if result["done"]:
-            print(f"\n  ✅ Episode complete in {step_num+1} steps!")
             break
-        # Feed updated CodeGraph back for next step
-        episode["codegraph"] = result.get("codegraph", {})
-    if not scores_history:
-        scores_history = [0.0]
-    improved = len(scores_history) > 1 and scores_history[-1] > scores_history[0]
     return {
         "task": task_id,
         "difficulty": difficulty,
         "scores": scores_history,
-        "final_score": scores_history[-1],
-        "improved": improved,
         "steps": len(scores_history),
     }
-def main():
-    """Run one episode per difficulty and print aggregate results."""
-    print(f"\n{'='*60}")
-    print(f"  SecureCodeEnv — Baseline Inference")
-    print(f"  Model: {MODEL_NAME}")
-    print(f"  Env:   {ENV_URL}")
-    print(f"{'='*60}")
-    # Verify environment is up
     try:
         health = requests.get(f"{ENV_URL}/health", timeout=10)
         health.raise_for_status()
-        print(f"\n  ✅ Environment healthy: {health.json()}")
     except Exception as e:
-        print(f"\n  ❌ Environment not reachable at {ENV_URL}: {e}")
-        print("  Start the server: uvicorn app.main:app --host 0.0.0.0 --port 7860")
-        sys.exit(1)
     results = []
-    start = time.time()
-    for difficulty in ["easy", "medium", "hard"]:
-        r = run_episode(difficulty)
-        results.append(r)
-        # Small pause between episodes
         time.sleep(1)
-    elapsed = time.time() - start
-    # ── Final report ──────────────────────────────────────────────────────
-    print(f"\n{'='*60}")
-    print(f"  FINAL RESULTS  ({elapsed:.1f}s total)")
-    print(f"{'='*60}")
-    for r in results:
-        status = "✅" if r["final_score"] >= 0.7 else "⚠️ " if r["final_score"] >= 0.4 else "❌"
-        improved_str = "↑ improved" if r.get("improved") else "—"
-        print(f"  {status} {r['task']:45s}  {r['final_score']:.3f}  {improved_str}")
-    valid_scores = [r["final_score"] for r in results]
-    avg = sum(valid_scores) / len(valid_scores) if valid_scores else 0
-    print(f"\n  Average final score: {avg:.3f}")
-    print(f"  Scores: {[round(s, 3) for s in valid_scores]}")
-    # Write machine-readable results
-    output = {
-        "model": MODEL_NAME,
-        "env_url": ENV_URL,
-        "elapsed_seconds": round(elapsed, 1),
-        "results": results,
-        "average_score": round(avg, 4),
-    }
     with open("inference_results.json", "w") as f:
-        json.dump(output, f, indent=2)
-    print(f"\n  Results saved to inference_results.json")
-    return 0 if avg >= 0.4 else 1
 if __name__ == "__main__":

 """
 SecureCodeEnv - Baseline Inference Script
 Required by hackathon. Runs an LLM agent through the environment.
 """
 import os
 import json
 import sys
 import requests
 from openai import OpenAI
+from typing import Dict, List, Any, Optional
+# ── Constants & Configuration ──────────────────────────────────────────────
 API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
+MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
+HF_TOKEN = os.environ.get("HF_TOKEN", "")
+ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860").rstrip("/")
+# Typed Exception for environment issues
+class EnvironmentConnectionError(Exception):
+    """Raised when the sandbox environment is unreachable or returns 5xx."""
+    pass
 if not HF_TOKEN:
     print("⚠️  HF_TOKEN not set. Some model endpoints may reject requests.", file=sys.stderr)
 client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN or "sk-placeholder")
 SYSTEM_PROMPT = """You are a senior Python security engineer.
 You write production-ready, secure Python code with no shortcuts.
 6. Use hmac.compare_digest for secret comparison (not ==).
 7. Validate all inputs — handle None, empty string, type errors.
 8. Add type hints and docstrings to every function.
+9. Use pathlib.Path.resolve() for file path validation."""
+def clean_code_output(raw_code: str) -> str:
+    """Removes markdown fences and surrounding whitespace safely."""
+    lines = raw_code.splitlines()
+    if not lines:
+        return ""
+    # Filter out markdown code fence markers
+    filtered = [line for line in lines if not line.strip().startswith("```")]
+    return "\n".join(filtered).strip()
+def run_episode(difficulty: str = "medium") -> Dict[str, Any]:
     """Run one full episode at the given difficulty and return results."""
+    print(f"\n{'='*60}\n  Episode: {difficulty.upper()}\n{'='*60}")
     try:
         reset_resp = requests.post(
             f"{ENV_URL}/reset",
             timeout=30,
         )
         reset_resp.raise_for_status()
+    except Exception as e:
         print(f"❌ /reset failed: {e}")
+        return {"task": f"reset_fail_{difficulty}", "scores": [0.0], "final_score": 0.0, "error": str(e)}
     episode = reset_resp.json()
     sid = episode["session_id"]
     task_id = episode["task_id"]
+    scores_history: List[float] = []
+    prev_feedback: Dict[str, Any] = {}
     for step_num in range(5):
         context = episode.get("codegraph", {})
         context_prompt = context.get("context_prompt", "")
+        context_str = context_prompt[:3000] if context_prompt else json.dumps(context)[:2000]
+        feedback_list = [f"{k}: {v}" for k, v in prev_feedback.items() if v]
+        feedback_str = "\n\nPREVIOUS FEEDBACK:\n" + "\n".join(feedback_list) if feedback_list else ""
+        user_message = f"Task: {episode['problem_statement']}\nTargets: {episode.get('cwe_targets', [])}\n{context_str}{feedback_str}\n\nImplementation:"
         try:
             response = client.chat.completions.create(
                 model=MODEL_NAME,
+                messages=[
+                    {"role": "system", "content": SYSTEM_PROMPT},
+                    {"role": "user", "content": user_message},
+                ],
                 max_tokens=1500,
+                temperature=0.1,
             )
+            raw_content = response.choices[0].message.content or ""
+            code = clean_code_output(raw_content)
+            if not code:
+                print(f"  ⚠️ Step {step_num}: LLM returned empty code.")
+                break
             step_resp = requests.post(
                 f"{ENV_URL}/step",
                 json={
                     "session_id": sid,
                     "code": code,
+                    "filename": f"solution_s{step_num}.py",
                     "task_id": task_id,
                 },
+                timeout=65,
             )
             step_resp.raise_for_status()
+            result = step_resp.json()
+            reward = result.get("total_reward", 0.0)
+            scores_history.append(reward)
+            prev_feedback = result.get("feedback", {})
+            print(f"  Step {step_num+1} → reward={reward:.3f}")
+            if result.get("done"):
+                break
+            episode["codegraph"] = result.get("codegraph", {})
+        except Exception as e:
+            print(f"  ⚠️ Error during step {step_num+1}: {e}")
             break
+    final_score = scores_history[-1] if scores_history else 0.0
     return {
         "task": task_id,
         "difficulty": difficulty,
         "scores": scores_history,
+        "final_score": final_score,
         "steps": len(scores_history),
     }
+def main() -> int:
+    """Main execution loop."""
+    print(f"Model: {MODEL_NAME} | Env: {ENV_URL}")
     try:
         health = requests.get(f"{ENV_URL}/health", timeout=10)
         health.raise_for_status()
     except Exception as e:
+        print(f"❌ Environment unreachable at {ENV_URL}. Ensure server is running.\nError: {e}")
+        return 1
     results = []
+    start_time = time.time()
+    for diff in ["easy", "medium", "hard"]:
+        try:
+            results.append(run_episode(diff))
+        except Exception as e:
+            print(f"Critical failure in {diff} episode: {e}")
         time.sleep(1)
+    elapsed = time.time() - start_time
+    avg_score = sum(r["final_score"] for r in results) / len(results) if results else 0.0
+    print(f"\n{'='*60}\n  FINAL AVERAGE: {avg_score:.3f} ({elapsed:.1f}s)\n{'='*60}")
     with open("inference_results.json", "w") as f:
+        json.dump({"results": results, "avg": avg_score}, f, indent=2)
+    # Return 0 whenever the script ran to completion, regardless of score;
+    # return 1 only if no episode produced a result (total failure).
+    return 0 if results else 1
 if __name__ == "__main__":