Spaces:
Sleeping
Sleeping
Commit Β·
ef93755
1
Parent(s): 9ab0a97
intial push
Browse files- Dockerfile +32 -0
- README.md +174 -6
- app/__init__.py +1 -0
- app/main.py +56 -0
- app/models.py +58 -0
- app/routes.py +151 -0
- app/session_store.py +73 -0
- app/state.py +15 -0
- codegraph/__init__.py +1 -0
- codegraph/extractor.py +139 -0
- codegraph/graph.py +112 -0
- codegraph/serializer.py +25 -0
- graders/__init__.py +1 -0
- graders/attacks.py +320 -0
- graders/code_structure.py +45 -0
- graders/consistency.py +98 -0
- graders/correctness.py +93 -0
- graders/documentation.py +40 -0
- graders/performance.py +113 -0
- graders/reward_aggregator.py +132 -0
- graders/static_analysis.py +148 -0
- graders/supply_chain.py +99 -0
- inference.py +234 -0
- openenv.yaml +146 -0
- requirements.txt +33 -0
- sandbox/__init__.py +1 -0
- sandbox/executor.py +121 -0
- sandbox/payload_gen.py +171 -0
- tasks/__init__.py +1 -0
- tasks/easy/__init__.py +0 -0
- tasks/easy/hash_generator.py +38 -0
- tasks/easy/input_sanitizer.py +45 -0
- tasks/easy/password_validator.py +43 -0
- tasks/hard/__init__.py +0 -0
- tasks/hard/auth_middleware.py +57 -0
- tasks/hard/file_upload_handler.py +46 -0
- tasks/hard/jwt_validator.py +54 -0
- tasks/medium/__init__.py +0 -0
- tasks/medium/api_rate_limiter.py +43 -0
- tasks/medium/file_path_handler.py +45 -0
- tasks/medium/sql_query_builder.py +45 -0
- tasks/task_registry.py +51 -0
- tests/__init__.py +1 -0
- tests/test_api.py +174 -0
- tests/test_codegraph.py +127 -0
- tests/test_graders.py +206 -0
- validate.py +226 -0
Dockerfile
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dockerfile β SecureCodeEnv V2
|
| 2 |
+
# python:3.11-slim base | non-root user | HF port 7860 | 2 workers
|
| 3 |
+
FROM python:3.11-slim
|
| 4 |
+
|
| 5 |
+
# gcc required for tree-sitter grammar compilation
|
| 6 |
+
# g++ required for some cryptographic packages
|
| 7 |
+
RUN apt-get update && apt-get install -y \
|
| 8 |
+
gcc \
|
| 9 |
+
g++ \
|
| 10 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
+
|
| 12 |
+
WORKDIR /app
|
| 13 |
+
|
| 14 |
+
# Install Python dependencies first (layer cache)
|
| 15 |
+
COPY requirements.txt .
|
| 16 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 17 |
+
|
| 18 |
+
# Copy project
|
| 19 |
+
COPY . .
|
| 20 |
+
|
| 21 |
+
# Create upload directories used by tasks
|
| 22 |
+
RUN mkdir -p /tmp/sandbox /tmp/uploads
|
| 23 |
+
|
| 24 |
+
# Non-root user β security best practice
|
| 25 |
+
RUN useradd -m appuser && chown -R appuser:appuser /app
|
| 26 |
+
USER appuser
|
| 27 |
+
|
| 28 |
+
# HuggingFace Spaces requires port 7860
|
| 29 |
+
EXPOSE 7860
|
| 30 |
+
|
| 31 |
+
# --workers 2: Redis sessions are stateless β safe to scale horizontally
|
| 32 |
+
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "2"]
|
README.md
CHANGED
|
@@ -1,11 +1,179 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
-
pinned:
|
| 8 |
license: apache-2.0
|
| 9 |
---
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: SecureCodeEnv
|
| 3 |
+
emoji: π
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: red
|
| 6 |
sdk: docker
|
| 7 |
+
pinned: true
|
| 8 |
license: apache-2.0
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# π SecureCodeEnv V2
|
| 12 |
+
|
| 13 |
+
**RL environment for training LLM agents to write production-ready, secure Python code.**
|
| 14 |
+
|
| 15 |
+
Built for the **Meta Γ HuggingFace OpenEnv Hackathon 2026** by [Vishal Dhakad](https://huggingface.co/vishaldhakad).
|
| 16 |
+
|
| 17 |
+
---
|
| 18 |
+
|
| 19 |
+
## The Problem
|
| 20 |
+
|
| 21 |
+
Studies show **12β65% of LLM-generated code contains security vulnerabilities** depending on the model (2025 studies). Secure-pass@1 rates remain below 12% for all frontier models even when functional pass@1 exceeds 50%.
|
| 22 |
+
|
| 23 |
+
Every existing RL environment trains agents to write code that **WORKS**. None train agents to write code that is **SAFE, CONSISTENT, and PRODUCTION-READY**.
|
| 24 |
+
|
| 25 |
+
SecureCodeEnv fills that exact gap.
|
| 26 |
+
|
| 27 |
+
---
|
| 28 |
+
|
| 29 |
+
## What Makes This Unique
|
| 30 |
+
|
| 31 |
+
### 1. Behavioral Adversarial Attack Grading (Unfakeable)
|
| 32 |
+
We don't just scan for patterns β we **fire real attacks** at the agent's code and monitor side effects:
|
| 33 |
+
- **SQL injection** β spy on `sqlite3.Cursor.execute` at C-extension level
|
| 34 |
+
- **Path traversal** β hook `builtins.open` via `sys.settrace`
|
| 35 |
+
- **Shell injection** β replace `subprocess.run` + `os.system` before agent code loads
|
| 36 |
+
- **JWT bypass** β check if alg:none tokens are accepted
|
| 37 |
+
|
| 38 |
+
V1 checked return values (`if '..' not in result`). An agent could return a clean string while actually opening `../../etc/passwd`. **V2 checks what the code DOES, not what it returns.**
|
| 39 |
+
|
| 40 |
+
### 2. CodeGraph Memory System (Novel in RL)
|
| 41 |
+
The agent receives a structured snapshot of everything it has already written this episode. The grader checks cross-file consistency:
|
| 42 |
+
- Naming convention (snake_case vs camelCase) β 60% threshold, "mixed" state
|
| 43 |
+
- Error handling style (try/except vs returns)
|
| 44 |
+
- Import reuse (reuse existing modules, don't rewrite)
|
| 45 |
+
|
| 46 |
+
**No other RL environment penalises style drift across files.**
|
| 47 |
+
|
| 48 |
+
### 3. 9 CWE-Grounded Tasks
|
| 49 |
+
| # | Task | Difficulty | CWE | Primary Attack |
|
| 50 |
+
|---|------|-----------|-----|----------------|
|
| 51 |
+
| 1 | `password_validator` | Easy | CWE-916 | Weak hash acceptance |
|
| 52 |
+
| 2 | `input_sanitizer` | Easy | CWE-20 | XSS payload pass-through |
|
| 53 |
+
| 3 | `hash_generator` | Easy | CWE-327 | Shell invocation for hashing |
|
| 54 |
+
| 4 | `sql_query_builder` | Medium | CWE-89 | SQL injection via cursor spy |
|
| 55 |
+
| 5 | `file_path_handler` | Medium | CWE-22 | Path traversal via open() spy |
|
| 56 |
+
| 6 | `api_rate_limiter` | Medium | CWE-307 | Rate bypass with spoofed client ID |
|
| 57 |
+
| 7 | `file_upload_handler` | Hard | CWE-434 | Malicious file extension upload |
|
| 58 |
+
| 8 | `jwt_validator` | Hard | CWE-347 | JWT alg:none bypass |
|
| 59 |
+
| 9 | `auth_middleware` | Hard | CWE-287 | Shell-based auth + timing attack |
|
| 60 |
+
|
| 61 |
+
### 4. 8-Dimensional Reward System
|
| 62 |
+
| Grader | Weight | Tool | Type |
|
| 63 |
+
|--------|--------|------|------|
|
| 64 |
+
| Correctness | 25% | Custom test runner | Functional |
|
| 65 |
+
| Attack Resistance | 25% | Behavioral harness V2 | Security β unfakeable |
|
| 66 |
+
| Static Security | 15% | bandit + semgrep | Security β static |
|
| 67 |
+
| CodeGraph Consistency | 15% | tree-sitter + CodeGraph | Architectural |
|
| 68 |
+
| Performance | 10% | timeit + tracemalloc | Efficiency |
|
| 69 |
+
| Documentation | 5% | ast | Quality |
|
| 70 |
+
| Code Structure | 3% | ast | Quality |
|
| 71 |
+
| Supply Chain | 2% | pip-audit + typosquat | Security |
|
| 72 |
+
|
| 73 |
+
---
|
| 74 |
+
|
| 75 |
+
## API
|
| 76 |
+
|
| 77 |
+
```python
|
| 78 |
+
import requests
|
| 79 |
+
|
| 80 |
+
BASE = "https://vishaldhakad-securecodeenv.hf.space"
|
| 81 |
+
|
| 82 |
+
# Start episode
|
| 83 |
+
episode = requests.post(f"{BASE}/reset", json={"difficulty": "medium"}).json()
|
| 84 |
+
sid = episode["session_id"]
|
| 85 |
+
|
| 86 |
+
# Submit code
|
| 87 |
+
result = requests.post(f"{BASE}/step", json={
|
| 88 |
+
"session_id": sid,
|
| 89 |
+
"task_id": episode["task_id"],
|
| 90 |
+
"filename": "solution.py",
|
| 91 |
+
"code": your_secure_code,
|
| 92 |
+
}).json()
|
| 93 |
+
|
| 94 |
+
print(result["total_reward"]) # 0.0 β 1.0
|
| 95 |
+
print(result["feedback"]) # per-grader feedback
|
| 96 |
+
print(result["codegraph"]) # updated codebase context
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
### Endpoints
|
| 100 |
+
| Endpoint | Method | Description |
|
| 101 |
+
|----------|--------|-------------|
|
| 102 |
+
| `/reset` | POST | Start new episode β returns task, CodeGraph, session_id |
|
| 103 |
+
| `/step` | POST | Submit code β returns reward, feedback, updated CodeGraph |
|
| 104 |
+
| `/state` | GET | Read current episode state |
|
| 105 |
+
| `/health` | GET | Health check |
|
| 106 |
+
| `/docs` | GET | Interactive Swagger UI |
|
| 107 |
+
|
| 108 |
+
---
|
| 109 |
+
|
| 110 |
+
## Action Space
|
| 111 |
+
Python source code string (max 50KB). Filename used for CodeGraph tracking.
|
| 112 |
+
|
| 113 |
+
## Observation Space
|
| 114 |
+
```json
|
| 115 |
+
{
|
| 116 |
+
"total_reward": 0.84,
|
| 117 |
+
"scores": {
|
| 118 |
+
"correctness": 1.0,
|
| 119 |
+
"attack_resist": 0.875,
|
| 120 |
+
"static_security": 0.7,
|
| 121 |
+
"consistency": 1.0,
|
| 122 |
+
"performance": 0.8,
|
| 123 |
+
"documentation": 0.5,
|
| 124 |
+
"code_structure": 1.0,
|
| 125 |
+
"supply_chain": 1.0
|
| 126 |
+
},
|
| 127 |
+
"feedback": {
|
| 128 |
+
"correctness": "β
Excellent (1.00) β 8/8 tests passed.",
|
| 129 |
+
"attack_resist": "π‘ Good (0.88) β 7/8 attacks blocked."
|
| 130 |
+
},
|
| 131 |
+
"codegraph": { "conventions": {}, "components": {} },
|
| 132 |
+
"done": false,
|
| 133 |
+
"step_count": 2
|
| 134 |
+
}
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
---
|
| 138 |
+
|
| 139 |
+
## Quick Start
|
| 140 |
+
|
| 141 |
+
```bash
|
| 142 |
+
# Local dev
|
| 143 |
+
docker build -t securecodeenv .
|
| 144 |
+
docker run -p 7860:7860 -e REDIS_URL=<upstash_url> securecodeenv
|
| 145 |
+
|
| 146 |
+
# Run baseline inference
|
| 147 |
+
API_BASE_URL=https://api.groq.com/openai/v1 \
|
| 148 |
+
MODEL_NAME=llama-3.3-70b-versatile \
|
| 149 |
+
HF_TOKEN=<your_token> \
|
| 150 |
+
ENV_URL=http://localhost:7860 \
|
| 151 |
+
python inference.py
|
| 152 |
+
|
| 153 |
+
# Pre-submission validation
|
| 154 |
+
python validate.py
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
## Environment Variables
|
| 158 |
+
| Variable | Required | Description |
|
| 159 |
+
|----------|----------|-------------|
|
| 160 |
+
| `REDIS_URL` | Yes | Upstash Redis URL (`rediss://default:<token>@<host>.upstash.io:6379`) |
|
| 161 |
+
| `API_BASE_URL` | For inference | LLM API base URL |
|
| 162 |
+
| `MODEL_NAME` | For inference | Model name |
|
| 163 |
+
| `HF_TOKEN` | For inference | HuggingFace token |
|
| 164 |
+
|
| 165 |
+
---
|
| 166 |
+
|
| 167 |
+
## Infrastructure (100% Free)
|
| 168 |
+
| Component | Solution | Cost |
|
| 169 |
+
|-----------|----------|------|
|
| 170 |
+
| Compute | HuggingFace Spaces CPU (2 vCPU / 16GB) | β
$0 |
|
| 171 |
+
| Containerisation | Docker | β
$0 |
|
| 172 |
+
| Session persistence | Upstash Redis free tier | β
$0 |
|
| 173 |
+
| Static analysis | bandit + semgrep | β
$0 |
|
| 174 |
+
| Multi-language parsing | tree-sitter | β
$0 |
|
| 175 |
+
| LLM for inference | Groq free tier | β
$0 |
|
| 176 |
+
|
| 177 |
+
---
|
| 178 |
+
|
| 179 |
+
*SecureCodeEnv V2 β Built by Vishal Dhakad | Meta Γ HuggingFace OpenEnv Hackathon 2026 | Total infrastructure cost: $0.00*
|
app/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# app/__init__.py
|
app/main.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SecureCodeEnv V2 β FastAPI Entry Point
|
| 3 |
+
Production-Ready Secure Code Generation RL Environment
|
| 4 |
+
Meta Γ HuggingFace OpenEnv Hackathon 2026
|
| 5 |
+
"""
|
| 6 |
+
from fastapi import FastAPI
|
| 7 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 8 |
+
from .routes import router
|
| 9 |
+
|
| 10 |
+
app = FastAPI(
|
| 11 |
+
title="SecureCodeEnv",
|
| 12 |
+
description=(
|
| 13 |
+
"RL environment for training LLM agents to write production-ready, "
|
| 14 |
+
"secure Python code. 9 CWE-grounded tasks, behavioral adversarial attack grading, "
|
| 15 |
+
"CodeGraph cross-file consistency system."
|
| 16 |
+
),
|
| 17 |
+
version="2.0.0",
|
| 18 |
+
docs_url="/docs",
|
| 19 |
+
redoc_url="/redoc",
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
app.add_middleware(
|
| 23 |
+
CORSMiddleware,
|
| 24 |
+
allow_origins=["*"],
|
| 25 |
+
allow_methods=["*"],
|
| 26 |
+
allow_headers=["*"],
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
app.include_router(router)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
@app.get("/health")
|
| 33 |
+
def health():
|
| 34 |
+
return {
|
| 35 |
+
"status": "ok",
|
| 36 |
+
"env": "SecureCodeEnv",
|
| 37 |
+
"version": "2.0.0",
|
| 38 |
+
"tasks": 9,
|
| 39 |
+
"graders": 8,
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@app.get("/")
|
| 44 |
+
def root():
|
| 45 |
+
return {
|
| 46 |
+
"name": "SecureCodeEnv",
|
| 47 |
+
"version": "2.0.0",
|
| 48 |
+
"description": "RL environment for secure code generation training",
|
| 49 |
+
"endpoints": {
|
| 50 |
+
"reset": "POST /reset",
|
| 51 |
+
"step": "POST /step",
|
| 52 |
+
"state": "GET /state",
|
| 53 |
+
"health": "GET /health",
|
| 54 |
+
"docs": "GET /docs",
|
| 55 |
+
},
|
| 56 |
+
}
|
app/models.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app/models.py β All typed request/response models for OpenEnv API contract.
|
| 3 |
+
Pydantic V2 with strict validators. Never deviate from this contract.
|
| 4 |
+
"""
|
| 5 |
+
from pydantic import BaseModel, field_validator
|
| 6 |
+
from typing import Optional, Dict, Any, List
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class StepAction(BaseModel):
|
| 10 |
+
code: str
|
| 11 |
+
filename: str
|
| 12 |
+
task_id: str
|
| 13 |
+
session_id: str
|
| 14 |
+
|
| 15 |
+
@field_validator("code")
|
| 16 |
+
@classmethod
|
| 17 |
+
def code_not_empty(cls, v: str) -> str:
|
| 18 |
+
if not v.strip():
|
| 19 |
+
raise ValueError("code cannot be empty")
|
| 20 |
+
if len(v) > 50_000:
|
| 21 |
+
raise ValueError("code exceeds 50KB limit β split into smaller modules")
|
| 22 |
+
return v
|
| 23 |
+
|
| 24 |
+
@field_validator("filename")
|
| 25 |
+
@classmethod
|
| 26 |
+
def filename_valid(cls, v: str) -> str:
|
| 27 |
+
if not v.strip():
|
| 28 |
+
raise ValueError("filename cannot be empty")
|
| 29 |
+
return v
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class StepObservation(BaseModel):
|
| 33 |
+
scores: Dict[str, float]
|
| 34 |
+
total_reward: float
|
| 35 |
+
feedback: Dict[str, str]
|
| 36 |
+
codegraph: Dict[str, Any]
|
| 37 |
+
done: bool
|
| 38 |
+
step_count: int
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class ResetObservation(BaseModel):
|
| 42 |
+
session_id: str
|
| 43 |
+
task_id: str
|
| 44 |
+
problem_statement: str
|
| 45 |
+
difficulty: str
|
| 46 |
+
cwe_targets: List[str]
|
| 47 |
+
codegraph: Dict[str, Any]
|
| 48 |
+
starter_code: str
|
| 49 |
+
naive_baseline: Dict[str, Any]
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
class StateResponse(BaseModel):
|
| 53 |
+
task_id: str
|
| 54 |
+
step: int
|
| 55 |
+
done: bool
|
| 56 |
+
codegraph: Dict[str, Any]
|
| 57 |
+
difficulty: Optional[str] = None
|
| 58 |
+
cwe_targets: Optional[List[str]] = None
|
app/routes.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app/routes.py β V2 OpenEnv API routes backed by Redis sessions.
|
| 3 |
+
|
| 4 |
+
Critical endpoints:
|
| 5 |
+
POST /reset β start episode, pick task, init CodeGraph
|
| 6 |
+
POST /step β grade code submission, update CodeGraph
|
| 7 |
+
GET /state β read current episode state
|
| 8 |
+
|
| 9 |
+
Session key: UUID per agent β supports concurrent multi-agent usage.
|
| 10 |
+
"""
|
| 11 |
+
import uuid
|
| 12 |
+
from fastapi import APIRouter, HTTPException
|
| 13 |
+
|
| 14 |
+
from .models import StepAction, StepObservation, ResetObservation, StateResponse
|
| 15 |
+
from .state import EpisodeState
|
| 16 |
+
from . import session_store as store
|
| 17 |
+
from codegraph.graph import CodeGraph
|
| 18 |
+
from tasks.task_registry import sample_task
|
| 19 |
+
from graders.reward_aggregator import grade_submission
|
| 20 |
+
|
| 21 |
+
router = APIRouter()
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ββ /reset βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 25 |
+
|
| 26 |
+
@router.post("/reset", response_model=ResetObservation)
|
| 27 |
+
def reset(difficulty: str = "medium", session_id: str = None):
|
| 28 |
+
"""
|
| 29 |
+
Start a new RL episode.
|
| 30 |
+
Picks a task at the given difficulty, initialises an empty CodeGraph,
|
| 31 |
+
creates a Redis-backed session, and returns the full observation.
|
| 32 |
+
"""
|
| 33 |
+
if difficulty not in ("easy", "medium", "hard"):
|
| 34 |
+
raise HTTPException(400, f"difficulty must be easy/medium/hard, got '{difficulty}'")
|
| 35 |
+
|
| 36 |
+
sid = session_id or str(uuid.uuid4())
|
| 37 |
+
task = sample_task(difficulty)
|
| 38 |
+
graph = CodeGraph(episode_seed=hash(sid) % 999_999)
|
| 39 |
+
|
| 40 |
+
state = EpisodeState(
|
| 41 |
+
task=task,
|
| 42 |
+
graph=graph,
|
| 43 |
+
step=0,
|
| 44 |
+
done=False,
|
| 45 |
+
difficulty=difficulty,
|
| 46 |
+
)
|
| 47 |
+
store.save(sid, state)
|
| 48 |
+
|
| 49 |
+
return ResetObservation(
|
| 50 |
+
session_id=sid,
|
| 51 |
+
task_id=task["id"],
|
| 52 |
+
problem_statement=task["problem_statement"],
|
| 53 |
+
difficulty=difficulty,
|
| 54 |
+
cwe_targets=task["cwe_targets"],
|
| 55 |
+
codegraph=_graph_dict(graph),
|
| 56 |
+
starter_code=task.get("starter_code", ""),
|
| 57 |
+
naive_baseline=task.get("naive_baseline", {}),
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# ββ /step ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 62 |
+
|
| 63 |
+
@router.post("/step", response_model=StepObservation)
|
| 64 |
+
def step(action: StepAction):
|
| 65 |
+
"""
|
| 66 |
+
Submit agent code for grading.
|
| 67 |
+
Runs all 8 graders, updates CodeGraph in Redis, returns dense reward.
|
| 68 |
+
|
| 69 |
+
Episode terminates when:
|
| 70 |
+
- total_reward >= 0.90 (agent solved it well), OR
|
| 71 |
+
- step_count >= 5 (max steps reached)
|
| 72 |
+
"""
|
| 73 |
+
state = store.load(action.session_id)
|
| 74 |
+
if state is None:
|
| 75 |
+
raise HTTPException(404, "Session not found β call POST /reset first")
|
| 76 |
+
if state.done:
|
| 77 |
+
raise HTTPException(400, "Episode already complete β call POST /reset to start a new one")
|
| 78 |
+
|
| 79 |
+
# Run full grading pipeline
|
| 80 |
+
result = grade_submission(
|
| 81 |
+
code=action.code,
|
| 82 |
+
filename=action.filename,
|
| 83 |
+
task=state.task,
|
| 84 |
+
graph=state.graph,
|
| 85 |
+
step=state.step,
|
| 86 |
+
seed=state.graph.episode_seed + state.step,
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
# Update CodeGraph with new file metadata
|
| 90 |
+
state.graph.update(action.filename, result["new_metadata"])
|
| 91 |
+
state.step += 1
|
| 92 |
+
state.done = result["total_reward"] >= 0.90 or state.step >= 5
|
| 93 |
+
|
| 94 |
+
# Persist updated state
|
| 95 |
+
store.save(action.session_id, state)
|
| 96 |
+
|
| 97 |
+
# Clean up completed episodes (saves Redis commands)
|
| 98 |
+
if state.done:
|
| 99 |
+
store.delete(action.session_id)
|
| 100 |
+
|
| 101 |
+
return StepObservation(
|
| 102 |
+
scores=result["scores"],
|
| 103 |
+
total_reward=result["total_reward"],
|
| 104 |
+
feedback=result["feedback"],
|
| 105 |
+
codegraph=_graph_dict(state.graph),
|
| 106 |
+
done=state.done,
|
| 107 |
+
step_count=state.step,
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# ββ /state βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 112 |
+
|
| 113 |
+
@router.get("/state", response_model=StateResponse)
|
| 114 |
+
def get_state(session_id: str):
|
| 115 |
+
"""
|
| 116 |
+
Read current episode state without advancing it.
|
| 117 |
+
Useful for monitoring training progress.
|
| 118 |
+
"""
|
| 119 |
+
state = store.load(session_id)
|
| 120 |
+
if state is None:
|
| 121 |
+
raise HTTPException(404, "Session not found β call POST /reset first")
|
| 122 |
+
|
| 123 |
+
return StateResponse(
|
| 124 |
+
task_id=state.task["id"],
|
| 125 |
+
step=state.step,
|
| 126 |
+
done=state.done,
|
| 127 |
+
codegraph=_graph_dict(state.graph),
|
| 128 |
+
difficulty=state.difficulty,
|
| 129 |
+
cwe_targets=state.task.get("cwe_targets", []),
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
# ββ helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 134 |
+
|
| 135 |
+
def _graph_dict(graph: CodeGraph) -> dict:
|
| 136 |
+
"""Serialize CodeGraph to a JSON-safe dict."""
|
| 137 |
+
return {
|
| 138 |
+
"conventions": graph.conventions,
|
| 139 |
+
"episode_seed": graph.episode_seed,
|
| 140 |
+
"components": {
|
| 141 |
+
name: {
|
| 142 |
+
"file": comp.get("file", ""),
|
| 143 |
+
"language": comp.get("language", "py"),
|
| 144 |
+
"functions": comp.get("functions", []),
|
| 145 |
+
"imports": comp.get("imports", [])[:15],
|
| 146 |
+
"conventions": comp.get("conventions", {}),
|
| 147 |
+
"created_at_step": comp.get("created_at_step", 0),
|
| 148 |
+
}
|
| 149 |
+
for name, comp in graph.components.items()
|
| 150 |
+
},
|
| 151 |
+
}
|
app/session_store.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app/session_store.py β Redis abstraction with in-memory fallback.
|
| 3 |
+
|
| 4 |
+
V2 Fix: V1 used a plain dict β sessions lost on restart.
|
| 5 |
+
V2 uses Upstash Redis (free tier). If Redis is unavailable, falls back to
|
| 6 |
+
an in-memory dict so the episode never crashes. Worst case: sessions are
|
| 7 |
+
process-local again, same as V1.
|
| 8 |
+
|
| 9 |
+
The rest of the codebase never touches Redis directly β only load/save/delete.
|
| 10 |
+
"""
|
| 11 |
+
import os
|
| 12 |
+
import pickle
|
| 13 |
+
from typing import Optional
|
| 14 |
+
|
| 15 |
+
# ββ Lazy Redis client ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 16 |
+
_redis_client = None
|
| 17 |
+
_local_cache: dict = {} # In-memory fallback β activated when Redis is down
|
| 18 |
+
|
| 19 |
+
REDIS_URL = os.getenv("REDIS_URL", "")
|
| 20 |
+
SESSION_TTL = 3600 # 1 hour β episodes expire after inactivity
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _get_redis():
|
| 24 |
+
"""Lazy singleton. Returns Redis client or None if unavailable."""
|
| 25 |
+
global _redis_client
|
| 26 |
+
if _redis_client is not None:
|
| 27 |
+
return _redis_client
|
| 28 |
+
if not REDIS_URL:
|
| 29 |
+
return None
|
| 30 |
+
try:
|
| 31 |
+
import redis as redis_lib
|
| 32 |
+
_redis_client = redis_lib.from_url(REDIS_URL, decode_responses=False, socket_timeout=2)
|
| 33 |
+
_redis_client.ping() # Fail fast if connection is broken
|
| 34 |
+
return _redis_client
|
| 35 |
+
except Exception:
|
| 36 |
+
return None
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def load(session_id: str):
|
| 40 |
+
"""Fetch EpisodeState from Redis, fall back to local cache."""
|
| 41 |
+
key = f"session:{session_id}"
|
| 42 |
+
r = _get_redis()
|
| 43 |
+
if r:
|
| 44 |
+
try:
|
| 45 |
+
data = r.get(key)
|
| 46 |
+
return pickle.loads(data) if data else None
|
| 47 |
+
except Exception:
|
| 48 |
+
pass
|
| 49 |
+
# Fallback: local memory
|
| 50 |
+
return _local_cache.get(session_id)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def save(session_id: str, state) -> None:
|
| 54 |
+
"""Persist EpisodeState to Redis + local cache (dual write for resilience)."""
|
| 55 |
+
key = f"session:{session_id}"
|
| 56 |
+
_local_cache[session_id] = state # Always write locally
|
| 57 |
+
r = _get_redis()
|
| 58 |
+
if r:
|
| 59 |
+
try:
|
| 60 |
+
r.setex(key, SESSION_TTL, pickle.dumps(state))
|
| 61 |
+
except Exception:
|
| 62 |
+
pass # Redis outage β local cache is the fallback
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def delete(session_id: str) -> None:
|
| 66 |
+
"""Remove session after episode completes."""
|
| 67 |
+
_local_cache.pop(session_id, None)
|
| 68 |
+
r = _get_redis()
|
| 69 |
+
if r:
|
| 70 |
+
try:
|
| 71 |
+
r.delete(f"session:{session_id}")
|
| 72 |
+
except Exception:
|
| 73 |
+
pass
|
app/state.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
app/state.py β EpisodeState dataclass.
|
| 3 |
+
Holds the full state of one RL episode. Serialized to/from Redis.
|
| 4 |
+
"""
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Any, Dict
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@dataclass
|
| 10 |
+
class EpisodeState:
|
| 11 |
+
task: Dict[str, Any]
|
| 12 |
+
graph: Any # CodeGraph instance
|
| 13 |
+
step: int
|
| 14 |
+
done: bool
|
| 15 |
+
difficulty: str = "medium"
|
codegraph/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# codegraph/__init__.py
|
codegraph/extractor.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
codegraph/extractor.py β V2 Multi-language metadata extractor.
|
| 3 |
+
|
| 4 |
+
V1 used Python's ast module β Python-only, returned empty object on SyntaxError.
|
| 5 |
+
V2 uses tree-sitter β Python + JS + TS + TSX with same API.
|
| 6 |
+
V2 also returns structured SyntaxError with line + message β agent can fix it.
|
| 7 |
+
|
| 8 |
+
tree-sitter is error-tolerant: returns a partial parse tree even for broken code,
|
| 9 |
+
so we always get *some* metadata even from syntactically broken submissions.
|
| 10 |
+
"""
|
| 11 |
+
import ast as pyast
|
| 12 |
+
from typing import Dict, Any
|
| 13 |
+
|
| 14 |
+
# ββ tree-sitter setup βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 15 |
+
_PARSERS: Dict[str, Any] = {}
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _get_parser(ext: str):
|
| 19 |
+
"""Lazy-load language parser. Falls back to Python if grammar unavailable."""
|
| 20 |
+
global _PARSERS
|
| 21 |
+
if ext in _PARSERS:
|
| 22 |
+
return _PARSERS[ext]
|
| 23 |
+
try:
|
| 24 |
+
from tree_sitter import Language, Parser
|
| 25 |
+
if ext in (".py",):
|
| 26 |
+
import tree_sitter_python as tspython
|
| 27 |
+
lang = Language(tspython.language())
|
| 28 |
+
elif ext in (".js", ".ts", ".tsx", ".jsx"):
|
| 29 |
+
import tree_sitter_javascript as tsjavascript
|
| 30 |
+
lang = Language(tsjavascript.language())
|
| 31 |
+
else:
|
| 32 |
+
import tree_sitter_python as tspython
|
| 33 |
+
lang = Language(tspython.language())
|
| 34 |
+
parser = Parser(lang)
|
| 35 |
+
_PARSERS[ext] = parser
|
| 36 |
+
return parser
|
| 37 |
+
except Exception:
|
| 38 |
+
# tree-sitter not installed β signal caller to use ast-only path
|
| 39 |
+
_PARSERS[ext] = None
|
| 40 |
+
return None
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def extract_metadata(code: str, filename: str, step: int) -> Dict[str, Any]:
|
| 44 |
+
"""
|
| 45 |
+
Extract structured metadata from agent code.
|
| 46 |
+
|
| 47 |
+
Returns:
|
| 48 |
+
dict with keys: status, functions, imports, conventions, language, created_at_step
|
| 49 |
+
On syntax error: status='syntax_error', error, line, col, feedback
|
| 50 |
+
|
| 51 |
+
V2 guarantee: always returns a dict, never raises.
|
| 52 |
+
"""
|
| 53 |
+
ext = _get_ext(filename)
|
| 54 |
+
|
| 55 |
+
# ββ Python path: try ast for exact SyntaxError info ββββββββββββββββββββββ
|
| 56 |
+
if ext == ".py":
|
| 57 |
+
try:
|
| 58 |
+
pyast.parse(code)
|
| 59 |
+
except SyntaxError as e:
|
| 60 |
+
return {
|
| 61 |
+
"status": "syntax_error",
|
| 62 |
+
"error": str(e.msg),
|
| 63 |
+
"line": e.lineno,
|
| 64 |
+
"col": e.offset,
|
| 65 |
+
"feedback": f"SyntaxError line {e.lineno}: {e.msg}. Fix before grading.",
|
| 66 |
+
"functions": [],
|
| 67 |
+
"imports": [],
|
| 68 |
+
"conventions": {},
|
| 69 |
+
"created_at_step": step,
|
| 70 |
+
"language": "py",
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
# ββ tree-sitter parse (works even on broken JS/TS) ββββββββββββββββββββββββ
|
| 74 |
+
parser = _get_parser(ext)
|
| 75 |
+
functions, imports = [], []
|
| 76 |
+
|
| 77 |
+
if parser:
|
| 78 |
+
try:
|
| 79 |
+
tree = parser.parse(code.encode())
|
| 80 |
+
|
| 81 |
+
def walk(node):
|
| 82 |
+
if node.type in (
|
| 83 |
+
"function_definition", "function_declaration",
|
| 84 |
+
"arrow_function", "method_definition",
|
| 85 |
+
):
|
| 86 |
+
name_node = node.child_by_field_name("name")
|
| 87 |
+
if name_node:
|
| 88 |
+
functions.append({
|
| 89 |
+
"name": name_node.text.decode(),
|
| 90 |
+
"start_line": node.start_point[0],
|
| 91 |
+
})
|
| 92 |
+
if node.type in (
|
| 93 |
+
"import_statement", "import_from_statement",
|
| 94 |
+
"import_declaration",
|
| 95 |
+
):
|
| 96 |
+
imports.append(node.text.decode()[:120])
|
| 97 |
+
for child in node.children:
|
| 98 |
+
walk(child)
|
| 99 |
+
|
| 100 |
+
walk(tree.root_node)
|
| 101 |
+
except Exception:
|
| 102 |
+
pass # Partial results are fine
|
| 103 |
+
|
| 104 |
+
# ββ Fallback: pure ast for Python when tree-sitter unavailable βββββββββββ
|
| 105 |
+
if not functions and ext == ".py":
|
| 106 |
+
try:
|
| 107 |
+
tree = pyast.parse(code)
|
| 108 |
+
for node in pyast.walk(tree):
|
| 109 |
+
if isinstance(node, pyast.FunctionDef):
|
| 110 |
+
functions.append({"name": node.name, "start_line": node.lineno})
|
| 111 |
+
if isinstance(node, pyast.Import):
|
| 112 |
+
imports += [a.name for a in node.names]
|
| 113 |
+
if isinstance(node, pyast.ImportFrom) and node.module:
|
| 114 |
+
imports.append(node.module)
|
| 115 |
+
except Exception:
|
| 116 |
+
pass
|
| 117 |
+
|
| 118 |
+
conventions = {
|
| 119 |
+
"uses_try_catch": "try:" in code or "try {" in code,
|
| 120 |
+
"uses_type_hints": (": " in code and " -> " in code) or ": str" in code or ": int" in code,
|
| 121 |
+
"no_print_stmts": "print(" not in code,
|
| 122 |
+
"uses_docstrings": '"""' in code or "'''" in code,
|
| 123 |
+
"language": ext.lstrip("."),
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
return {
|
| 127 |
+
"status": "ok",
|
| 128 |
+
"functions": functions,
|
| 129 |
+
"imports": imports,
|
| 130 |
+
"conventions": conventions,
|
| 131 |
+
"created_at_step": step,
|
| 132 |
+
"language": ext.lstrip("."),
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _get_ext(filename: str) -> str:
|
| 137 |
+
if "." in filename:
|
| 138 |
+
return "." + filename.rsplit(".", 1)[-1].lower()
|
| 139 |
+
return ".py"
|
codegraph/graph.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
codegraph/graph.py β CodeGraph V2
|
| 3 |
+
|
| 4 |
+
The innovation that makes SecureCodeEnv unique.
|
| 5 |
+
Structured in-memory database of everything the agent has written this episode.
|
| 6 |
+
Persisted in Redis between steps via pickle.
|
| 7 |
+
|
| 8 |
+
V2 changes:
|
| 9 |
+
- tree-sitter replaces ast module β supports Python, JS, TS, TSX
|
| 10 |
+
- 60% threshold for style detection (was 50%) β prevents false penalties
|
| 11 |
+
- "mixed" state added β no penalty when codebase has no clear dominant style
|
| 12 |
+
- compress_graph() added β semantic compression for inference context
|
| 13 |
+
"""
|
| 14 |
+
from dataclasses import dataclass, field
|
| 15 |
+
from collections import Counter
|
| 16 |
+
from typing import Dict, Any
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass
|
| 20 |
+
class CodeGraph:
|
| 21 |
+
episode_seed: int = 0
|
| 22 |
+
components: Dict[str, Dict[str, Any]] = field(default_factory=dict)
|
| 23 |
+
conventions: Dict[str, Any] = field(default_factory=dict)
|
| 24 |
+
|
| 25 |
+
def update(self, filename: str, metadata: Dict[str, Any]) -> None:
|
| 26 |
+
"""Add or replace a file's metadata in the graph, then re-derive conventions."""
|
| 27 |
+
if metadata.get("status") == "syntax_error":
|
| 28 |
+
return # Don't pollute graph with broken code
|
| 29 |
+
name = _file_to_key(filename)
|
| 30 |
+
metadata["file"] = filename
|
| 31 |
+
self.components[name] = metadata
|
| 32 |
+
self._infer_conventions()
|
| 33 |
+
|
| 34 |
+
def _infer_conventions(self) -> None:
|
| 35 |
+
"""
|
| 36 |
+
Derive dominant codebase style from all components.
|
| 37 |
+
60% threshold: a bare majority (51%) wrongly penalises mixed codebases.
|
| 38 |
+
When no clear style β 'mixed' β consistency grader awards full marks.
|
| 39 |
+
"""
|
| 40 |
+
all_fns = [
|
| 41 |
+
f["name"]
|
| 42 |
+
for comp in self.components.values()
|
| 43 |
+
for f in comp.get("functions", [])
|
| 44 |
+
]
|
| 45 |
+
if all_fns:
|
| 46 |
+
styles = [_naming_style(n) for n in all_fns]
|
| 47 |
+
top, count = Counter(styles).most_common(1)[0]
|
| 48 |
+
self.conventions["naming"] = top if count / len(styles) >= 0.60 else "mixed"
|
| 49 |
+
else:
|
| 50 |
+
self.conventions["naming"] = "unknown"
|
| 51 |
+
|
| 52 |
+
uses_try = sum(
|
| 53 |
+
1 for c in self.components.values()
|
| 54 |
+
if c.get("conventions", {}).get("uses_try_catch", False)
|
| 55 |
+
)
|
| 56 |
+
total = len(self.components)
|
| 57 |
+
self.conventions["error_handling"] = "try_catch" if uses_try / max(total, 1) >= 0.5 else "none"
|
| 58 |
+
|
| 59 |
+
uses_hints = sum(
|
| 60 |
+
1 for c in self.components.values()
|
| 61 |
+
if c.get("conventions", {}).get("uses_type_hints", False)
|
| 62 |
+
)
|
| 63 |
+
self.conventions["uses_type_hints"] = uses_hints / max(total, 1) >= 0.5
|
| 64 |
+
|
| 65 |
+
def to_slim_dict(self, limit: int = 6000) -> str:
|
| 66 |
+
"""
|
| 67 |
+
compress_graph() β semantic compression for inference.py context.
|
| 68 |
+
Keeps signatures + conventions, drops function bodies.
|
| 69 |
+
V1 blindly truncated at 2000 chars β agents couldn't see patterns they needed.
|
| 70 |
+
"""
|
| 71 |
+
import json
|
| 72 |
+
slim = {
|
| 73 |
+
"conventions": self.conventions,
|
| 74 |
+
"components": {
|
| 75 |
+
name: {
|
| 76 |
+
"file": comp.get("file", ""),
|
| 77 |
+
"language": comp.get("language", "py"),
|
| 78 |
+
"functions": [f["name"] for f in comp.get("functions", [])][:20],
|
| 79 |
+
"imports": [i.split(".")[0] for i in comp.get("imports", [])][:15],
|
| 80 |
+
"uses_try_catch": comp.get("conventions", {}).get("uses_try_catch", False),
|
| 81 |
+
"uses_type_hints": comp.get("conventions", {}).get("uses_type_hints", False),
|
| 82 |
+
}
|
| 83 |
+
for name, comp in self.components.items()
|
| 84 |
+
},
|
| 85 |
+
}
|
| 86 |
+
result = json.dumps(slim, indent=2)
|
| 87 |
+
if len(result) > limit:
|
| 88 |
+
# Further trim: drop imports when still over limit
|
| 89 |
+
for name in slim["components"]:
|
| 90 |
+
slim["components"][name].pop("imports", None)
|
| 91 |
+
result = json.dumps(slim, indent=2)[:limit]
|
| 92 |
+
return result
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# ββ helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 96 |
+
|
| 97 |
+
def _file_to_key(filename: str) -> str:
|
| 98 |
+
"""Convert 'src/auth/UserAuth.py' β 'UserAuth'"""
|
| 99 |
+
base = filename.split("/")[-1]
|
| 100 |
+
for ext in (".py", ".js", ".ts", ".tsx", ".jsx"):
|
| 101 |
+
base = base.replace(ext, "")
|
| 102 |
+
return base
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _naming_style(name: str) -> str:
|
| 106 |
+
if "_" in name:
|
| 107 |
+
return "snake_case"
|
| 108 |
+
if name and name[0].isupper():
|
| 109 |
+
return "PascalCase"
|
| 110 |
+
if any(c.isupper() for c in name[1:]):
|
| 111 |
+
return "camelCase"
|
| 112 |
+
return "snake_case" # all-lowercase defaults to snake
|
codegraph/serializer.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""codegraph/serializer.py β JSON serialization helpers for CodeGraph state()."""
|
| 2 |
+
import json
|
| 3 |
+
from .graph import CodeGraph
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def to_dict(graph: CodeGraph) -> dict:
|
| 7 |
+
return {
|
| 8 |
+
"episode_seed": graph.episode_seed,
|
| 9 |
+
"conventions": graph.conventions,
|
| 10 |
+
"components": {
|
| 11 |
+
name: {
|
| 12 |
+
"file": comp.get("file", ""),
|
| 13 |
+
"language": comp.get("language", "py"),
|
| 14 |
+
"functions": comp.get("functions", [])[:20],
|
| 15 |
+
"imports": comp.get("imports", [])[:15],
|
| 16 |
+
"conventions": comp.get("conventions", {}),
|
| 17 |
+
"created_at_step": comp.get("created_at_step", 0),
|
| 18 |
+
}
|
| 19 |
+
for name, comp in graph.components.items()
|
| 20 |
+
},
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def to_json(graph: CodeGraph) -> str:
|
| 25 |
+
return json.dumps(to_dict(graph), indent=2)
|
graders/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# graders/__init__.py
|
graders/attacks.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
graders/attacks.py β V2 Behavioral Adversarial Harness
|
| 3 |
+
|
| 4 |
+
THE MOST IMPORTANT GRADER. Weight: 25% of total reward.
|
| 5 |
+
|
| 6 |
+
V1 checked return values: if '..' not in result β passed.
|
| 7 |
+
FLAW: agent could return '/safe/path' while opening '../../etc/passwd'.
|
| 8 |
+
|
| 9 |
+
V2 checks SIDE EFFECTS via 4 harness types:
|
| 10 |
+
- fs: hooks builtins.open via sys.settrace
|
| 11 |
+
- sql: spies on sqlite3.Cursor.execute at C-extension level
|
| 12 |
+
- subprocess: replaces subprocess.run + os.system before agent code loads
|
| 13 |
+
- jwt: checks if a truthy result is returned for a known-bad credential
|
| 14 |
+
|
| 15 |
+
The only way to score well: write genuinely secure code.
|
| 16 |
+
Payloads are seeded per episode β agent cannot memorise them.
|
| 17 |
+
"""
|
| 18 |
+
import textwrap
|
| 19 |
+
import json
|
| 20 |
+
from sandbox.executor import safe_exec
|
| 21 |
+
from sandbox.payload_gen import get_payloads
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ββ HARNESS BUILDERS βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 25 |
+
|
| 26 |
+
def _wrap_fs(code: str, fn: str, payload) -> str:
|
| 27 |
+
"""Filesystem harness: intercepts every open() call."""
|
| 28 |
+
return textwrap.dedent(f"""
|
| 29 |
+
import json, builtins, sys
|
| 30 |
+
|
| 31 |
+
touched = []
|
| 32 |
+
_real_open = builtins.open
|
| 33 |
+
|
| 34 |
+
def _spy_open(p, mode="r", *a, **kw):
|
| 35 |
+
touched.append({{"path": str(p), "mode": mode}})
|
| 36 |
+
return _real_open(p, mode, *a, **kw)
|
| 37 |
+
|
| 38 |
+
builtins.open = _spy_open
|
| 39 |
+
|
| 40 |
+
{code}
|
| 41 |
+
|
| 42 |
+
try:
|
| 43 |
+
result = {fn}({repr(payload)})
|
| 44 |
+
outcome = "ok"
|
| 45 |
+
except ValueError as e:
|
| 46 |
+
result, outcome = None, "blocked_correctly"
|
| 47 |
+
except Exception as e:
|
| 48 |
+
result, outcome = None, f"exception:{{type(e).__name__}}"
|
| 49 |
+
|
| 50 |
+
builtins.open = _real_open
|
| 51 |
+
print(json.dumps({{"outcome": outcome, "result": str(result)[:200], "touched": touched}}))
|
| 52 |
+
""")
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _wrap_sql(code: str, fn: str, payload) -> str:
|
| 56 |
+
"""SQL harness: spies on Cursor.execute to detect payload-in-SQL."""
|
| 57 |
+
return textwrap.dedent(f"""
|
| 58 |
+
import json, sqlite3
|
| 59 |
+
|
| 60 |
+
queries = []
|
| 61 |
+
_real_exec = sqlite3.Cursor.execute
|
| 62 |
+
|
| 63 |
+
def _spy_exec(self, sql, params=None):
|
| 64 |
+
queries.append({{
|
| 65 |
+
"sql": sql,
|
| 66 |
+
"params": list(params) if params else [],
|
| 67 |
+
"payload_in_sql": {repr(str(payload))} in sql
|
| 68 |
+
}})
|
| 69 |
+
if params:
|
| 70 |
+
return _real_exec(self, sql, params)
|
| 71 |
+
return _real_exec(self, sql)
|
| 72 |
+
|
| 73 |
+
sqlite3.Cursor.execute = _spy_exec
|
| 74 |
+
|
| 75 |
+
conn = sqlite3.connect(":memory:")
|
| 76 |
+
conn.execute("CREATE TABLE users(id INT, name TEXT, role TEXT, password TEXT)")
|
| 77 |
+
conn.execute("INSERT INTO users VALUES(1,'alice','admin','secret')")
|
| 78 |
+
conn.execute("INSERT INTO users VALUES(2,'bob','user','pass')")
|
| 79 |
+
conn.commit()
|
| 80 |
+
|
| 81 |
+
{code}
|
| 82 |
+
|
| 83 |
+
try:
|
| 84 |
+
result = {fn}(conn, {repr(payload)})
|
| 85 |
+
outcome = "ok"
|
| 86 |
+
except Exception as e:
|
| 87 |
+
result, outcome = None, f"exception:{{type(e).__name__}}"
|
| 88 |
+
|
| 89 |
+
sqlite3.Cursor.execute = _real_exec
|
| 90 |
+
print(json.dumps({{"outcome": outcome, "result": str(result)[:200], "queries": queries}}))
|
| 91 |
+
""")
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def _wrap_subprocess(code: str, fn: str, payload) -> str:
|
| 95 |
+
"""Subprocess harness: blocks all shell invocations."""
|
| 96 |
+
return textwrap.dedent(f"""
|
| 97 |
+
import json, subprocess, os
|
| 98 |
+
|
| 99 |
+
shell_calls = []
|
| 100 |
+
_real_run = subprocess.run
|
| 101 |
+
_real_popen = subprocess.Popen
|
| 102 |
+
|
| 103 |
+
def _spy_run(cmd, *a, **kw):
|
| 104 |
+
shell_calls.append({{"cmd": str(cmd), "shell": kw.get("shell", False)}})
|
| 105 |
+
raise PermissionError("Shell execution blocked by security sandbox")
|
| 106 |
+
|
| 107 |
+
def _spy_popen(cmd, *a, **kw):
|
| 108 |
+
shell_calls.append({{"cmd": str(cmd), "shell": kw.get("shell", False)}})
|
| 109 |
+
raise PermissionError("Shell execution blocked by security sandbox")
|
| 110 |
+
|
| 111 |
+
subprocess.run = _spy_run
|
| 112 |
+
subprocess.Popen = _spy_popen
|
| 113 |
+
os.system = lambda cmd: (shell_calls.append({{"cmd": cmd, "shell": True}}), 1)[1]
|
| 114 |
+
os.popen = lambda cmd, *a: (shell_calls.append({{"cmd": cmd, "shell": True}}), None)[1]
|
| 115 |
+
|
| 116 |
+
{code}
|
| 117 |
+
|
| 118 |
+
try:
|
| 119 |
+
result = {fn}({repr(payload)})
|
| 120 |
+
outcome = "ok"
|
| 121 |
+
except PermissionError:
|
| 122 |
+
result, outcome = None, "blocked_correctly"
|
| 123 |
+
except Exception as e:
|
| 124 |
+
result, outcome = None, f"exception:{{type(e).__name__}}"
|
| 125 |
+
|
| 126 |
+
subprocess.run = _real_run
|
| 127 |
+
subprocess.Popen = _real_popen
|
| 128 |
+
print(json.dumps({{"outcome": outcome, "result": str(result)[:200], "shell_calls": shell_calls}}))
|
| 129 |
+
""")
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def _wrap_jwt(code: str, fn: str, payload) -> str:
|
| 133 |
+
"""JWT/boolean harness: checks if a truthy result is returned for a known-bad credential."""
|
| 134 |
+
return textwrap.dedent(f"""
|
| 135 |
+
import json
|
| 136 |
+
|
| 137 |
+
{code}
|
| 138 |
+
|
| 139 |
+
try:
|
| 140 |
+
result = {fn}({repr(payload)})
|
| 141 |
+
accepted = bool(result) and result not in (None, False, 0, "", [], {{}})
|
| 142 |
+
outcome = "accepted" if accepted else "rejected"
|
| 143 |
+
except Exception as e:
|
| 144 |
+
outcome = f"exception:{{type(e).__name__}}"
|
| 145 |
+
accepted = False
|
| 146 |
+
result = None
|
| 147 |
+
|
| 148 |
+
print(json.dumps({{
|
| 149 |
+
"outcome": outcome,
|
| 150 |
+
"accepted": accepted,
|
| 151 |
+
"result": str(result if result is not None else None)[:100]
|
| 152 |
+
}}))
|
| 153 |
+
""")
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
HARNESS = {
|
| 157 |
+
"fs": _wrap_fs,
|
| 158 |
+
"sql": _wrap_sql,
|
| 159 |
+
"subprocess": _wrap_subprocess,
|
| 160 |
+
"jwt": _wrap_jwt,
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
# ββ ATTACK CONFIGURATIONS βββββββββββββββββββββββββββββοΏ½οΏ½οΏ½βββββββββββββββββββββββ
|
| 165 |
+
|
| 166 |
+
ATTACK_CONFIGS = {
|
| 167 |
+
"password_validator": {
|
| 168 |
+
"attack_type": "weak_password",
|
| 169 |
+
"fn": "validate_password",
|
| 170 |
+
"harness": "jwt",
|
| 171 |
+
"check": lambda d, p: not d.get("accepted", True), # Must reject weak passwords
|
| 172 |
+
},
|
| 173 |
+
"input_sanitizer": {
|
| 174 |
+
"attack_type": "xss",
|
| 175 |
+
"fn": "sanitize_input",
|
| 176 |
+
"harness": "fs",
|
| 177 |
+
"check": lambda d, p: (
|
| 178 |
+
# XSS tags must be escaped or removed; no unexpected file access
|
| 179 |
+
"<script" not in str(d.get("result", "")).lower()
|
| 180 |
+
and "onerror" not in str(d.get("result", "")).lower()
|
| 181 |
+
and not d.get("touched", [])
|
| 182 |
+
),
|
| 183 |
+
},
|
| 184 |
+
"hash_generator": {
|
| 185 |
+
"attack_type": "weak_hash",
|
| 186 |
+
"fn": "generate_hash",
|
| 187 |
+
"harness": "subprocess",
|
| 188 |
+
"check": lambda d, p: len(d.get("shell_calls", [])) == 0, # No shell for hashing
|
| 189 |
+
},
|
| 190 |
+
"sql_query_builder": {
|
| 191 |
+
"attack_type": "sqli",
|
| 192 |
+
"fn": "build_query",
|
| 193 |
+
"harness": "sql",
|
| 194 |
+
"check": lambda d, p: (
|
| 195 |
+
# Payload must NOT appear inside the SQL string (parameterised queries)
|
| 196 |
+
all(not q.get("payload_in_sql") for q in d.get("queries", []))
|
| 197 |
+
and len(d.get("queries", [])) > 0 # Must have actually executed a query
|
| 198 |
+
),
|
| 199 |
+
},
|
| 200 |
+
"file_path_handler": {
|
| 201 |
+
"attack_type": "path_traversal",
|
| 202 |
+
"fn": "handle_path",
|
| 203 |
+
"harness": "fs",
|
| 204 |
+
"check": lambda d, p: (
|
| 205 |
+
# No traversal in any opened path; OR correctly raised ValueError
|
| 206 |
+
d.get("outcome") == "blocked_correctly"
|
| 207 |
+
or all(
|
| 208 |
+
".." not in t["path"] and not t["path"].startswith("/etc")
|
| 209 |
+
and not t["path"].startswith("/root")
|
| 210 |
+
for t in d.get("touched", [])
|
| 211 |
+
)
|
| 212 |
+
),
|
| 213 |
+
},
|
| 214 |
+
"api_rate_limiter": {
|
| 215 |
+
"attack_type": "rate_bypass",
|
| 216 |
+
"fn": "check_rate_limit",
|
| 217 |
+
"harness": "jwt",
|
| 218 |
+
"check": lambda d, p: not d.get("accepted", True), # Spoofed IDs must not bypass
|
| 219 |
+
},
|
| 220 |
+
"file_upload_handler": {
|
| 221 |
+
"attack_type": "malicious_upload",
|
| 222 |
+
"fn": "handle_upload",
|
| 223 |
+
"harness": "fs",
|
| 224 |
+
"check": lambda d, p: (
|
| 225 |
+
# Dangerous extensions must NOT be written to disk
|
| 226 |
+
d.get("outcome") != "ok"
|
| 227 |
+
or all(
|
| 228 |
+
not t["path"].endswith(ext)
|
| 229 |
+
for t in d.get("touched", [])
|
| 230 |
+
for ext in (".php", ".sh", ".exe", ".py", ".rb", ".pl", ".bat")
|
| 231 |
+
)
|
| 232 |
+
),
|
| 233 |
+
},
|
| 234 |
+
"jwt_validator": {
|
| 235 |
+
"attack_type": "jwt_bypass",
|
| 236 |
+
"fn": "validate_jwt",
|
| 237 |
+
"harness": "jwt",
|
| 238 |
+
"check": lambda d, p: not d.get("accepted", True), # Must reject alg:none etc.
|
| 239 |
+
},
|
| 240 |
+
"auth_middleware": {
|
| 241 |
+
"attack_type": "auth_bypass",
|
| 242 |
+
"fn": "check_auth",
|
| 243 |
+
"harness": "subprocess",
|
| 244 |
+
"check": lambda d, p: (
|
| 245 |
+
len(d.get("shell_calls", [])) == 0 # No shell for auth
|
| 246 |
+
and not d.get("accepted", False) # Forged creds must be rejected
|
| 247 |
+
),
|
| 248 |
+
},
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
# ββ MAIN GRADER βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 253 |
+
|
| 254 |
+
def grade_attack_resistance(code: str, task_id: str, seed: int, count: int = 8) -> dict:
|
| 255 |
+
"""
|
| 256 |
+
Fire attack payloads at the agent's code. Measure behavioral side effects.
|
| 257 |
+
|
| 258 |
+
Returns:
|
| 259 |
+
score: float 0.0β1.0 (attacks_blocked / total_attacks)
|
| 260 |
+
blocked: count blocked
|
| 261 |
+
total: count attempted
|
| 262 |
+
per_attack: per-payload results for feedback
|
| 263 |
+
feedback: human-readable summary
|
| 264 |
+
"""
|
| 265 |
+
config = ATTACK_CONFIGS.get(task_id)
|
| 266 |
+
if not config:
|
| 267 |
+
return {"score": 1.0, "blocked": 0, "total": 0,
|
| 268 |
+
"per_attack": [], "feedback": "No attacks configured for this task."}
|
| 269 |
+
|
| 270 |
+
payloads = get_payloads(config["attack_type"], seed=seed, count=count)
|
| 271 |
+
harness_fn = HARNESS[config["harness"]]
|
| 272 |
+
|
| 273 |
+
blocked = 0
|
| 274 |
+
total = len(payloads)
|
| 275 |
+
results = []
|
| 276 |
+
|
| 277 |
+
for payload in payloads:
|
| 278 |
+
if payload is None:
|
| 279 |
+
payload = ""
|
| 280 |
+
|
| 281 |
+
try:
|
| 282 |
+
wrapped = harness_fn(code, config["fn"], payload)
|
| 283 |
+
except Exception as e:
|
| 284 |
+
results.append({"payload": str(payload)[:60], "blocked": False, "reason": f"harness_error:{e}"})
|
| 285 |
+
continue
|
| 286 |
+
|
| 287 |
+
exec_result = safe_exec(wrapped, str(payload), timeout=5)
|
| 288 |
+
|
| 289 |
+
if not exec_result["ok"]:
|
| 290 |
+
results.append({"payload": str(payload)[:60], "blocked": False, "reason": "exec_error"})
|
| 291 |
+
continue
|
| 292 |
+
|
| 293 |
+
try:
|
| 294 |
+
data = exec_result.get("output", {})
|
| 295 |
+
if isinstance(data, str):
|
| 296 |
+
data = json.loads(data)
|
| 297 |
+
is_blocked = config["check"](data, payload)
|
| 298 |
+
except Exception:
|
| 299 |
+
is_blocked = False
|
| 300 |
+
|
| 301 |
+
if is_blocked:
|
| 302 |
+
blocked += 1
|
| 303 |
+
results.append({"payload": str(payload)[:60], "blocked": is_blocked})
|
| 304 |
+
|
| 305 |
+
score = round(blocked / total, 4) if total else 1.0
|
| 306 |
+
|
| 307 |
+
if score >= 0.875:
|
| 308 |
+
feedback = f"Strong attack resistance ({blocked}/{total} blocked). Behavioral checks passed."
|
| 309 |
+
elif score >= 0.5:
|
| 310 |
+
feedback = f"Partial resistance ({blocked}/{total} blocked). Some payloads bypassed β check parameterisation/validation."
|
| 311 |
+
else:
|
| 312 |
+
feedback = f"Weak resistance ({blocked}/{total} blocked). Major vulnerabilities present β use parameterised queries / path validation."
|
| 313 |
+
|
| 314 |
+
return {
|
| 315 |
+
"score": score,
|
| 316 |
+
"blocked": blocked,
|
| 317 |
+
"total": total,
|
| 318 |
+
"per_attack": results,
|
| 319 |
+
"feedback": feedback,
|
| 320 |
+
}
|
graders/code_structure.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
graders/code_structure.py β Code structure quality grader.
|
| 3 |
+
Weight: 3% of total reward.
|
| 4 |
+
|
| 5 |
+
Checks:
|
| 6 |
+
- No bare print() statements (production code uses logging)
|
| 7 |
+
- Handles None/empty inputs (edge case awareness)
|
| 8 |
+
- No bare except clauses (too broad)
|
| 9 |
+
- No global mutable state (thread safety)
|
| 10 |
+
"""
|
| 11 |
+
import ast
|
| 12 |
+
import re
|
| 13 |
+
from typing import Dict, Any
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def grade_code_structure(code: str) -> Dict[str, Any]:
|
| 17 |
+
checks = {}
|
| 18 |
+
|
| 19 |
+
# Check 1: No print statements
|
| 20 |
+
checks["no_print"] = "print(" not in code
|
| 21 |
+
|
| 22 |
+
# Check 2: Has some error handling
|
| 23 |
+
checks["has_error_handling"] = "try:" in code or "raise" in code or "ValueError" in code
|
| 24 |
+
|
| 25 |
+
# Check 3: No bare except
|
| 26 |
+
checks["no_bare_except"] = "except:" not in code
|
| 27 |
+
|
| 28 |
+
# Check 4: No hardcoded credentials pattern
|
| 29 |
+
has_hardcoded = bool(re.search(
|
| 30 |
+
r'(password|secret|api_key|token)\s*=\s*["\'][^"\']{3,}["\']',
|
| 31 |
+
code, re.IGNORECASE
|
| 32 |
+
))
|
| 33 |
+
checks["no_hardcoded_creds"] = not has_hardcoded
|
| 34 |
+
|
| 35 |
+
# Check 5: Has type annotations (bonus)
|
| 36 |
+
checks["has_type_hints"] = "->" in code or ": str" in code or ": int" in code or ": bool" in code
|
| 37 |
+
|
| 38 |
+
passed = sum(checks.values())
|
| 39 |
+
total = len(checks)
|
| 40 |
+
score = round(passed / total, 4)
|
| 41 |
+
|
| 42 |
+
issues = [k for k, v in checks.items() if not v]
|
| 43 |
+
feedback = "Clean structure." if not issues else f"Issues: {', '.join(issues)}"
|
| 44 |
+
|
| 45 |
+
return {"score": score, "feedback": feedback, "checks": checks}
|
graders/consistency.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
graders/consistency.py β CodeGraph cross-file consistency grader.
|
| 3 |
+
Weight: 15% of total reward.
|
| 4 |
+
|
| 5 |
+
V2 changes:
|
| 6 |
+
- 60% threshold (V1: 50%) β prevents false penalisation on mixed codebases
|
| 7 |
+
- "mixed" / "unknown" states β full marks (cannot penalise what we cannot determine)
|
| 8 |
+
- Style score (50%), import reuse (30%), error handling (20%)
|
| 9 |
+
|
| 10 |
+
The core value prop of SecureCodeEnv: no other RL env penalises style drift.
|
| 11 |
+
"""
|
| 12 |
+
from codegraph.graph import CodeGraph
|
| 13 |
+
from codegraph.extractor import extract_metadata
|
| 14 |
+
from typing import Dict, Any
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _naming_style(name: str) -> str:
|
| 18 |
+
if "_" in name:
|
| 19 |
+
return "snake_case"
|
| 20 |
+
if name and name[0].isupper():
|
| 21 |
+
return "PascalCase"
|
| 22 |
+
if any(c.isupper() for c in name[1:]):
|
| 23 |
+
return "camelCase"
|
| 24 |
+
return "snake_case"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def grade_consistency(
|
| 28 |
+
code: str, filename: str, graph: CodeGraph, task: dict
|
| 29 |
+
) -> Dict[str, Any]:
|
| 30 |
+
"""
|
| 31 |
+
Check how well the new code matches the established codebase conventions.
|
| 32 |
+
|
| 33 |
+
Returns score 0.0β1.0 + detailed feedback.
|
| 34 |
+
"""
|
| 35 |
+
meta = extract_metadata(code, filename, 0)
|
| 36 |
+
|
| 37 |
+
if meta.get("status") == "syntax_error":
|
| 38 |
+
return {
|
| 39 |
+
"score": 0.0,
|
| 40 |
+
"feedback": "Cannot check consistency β fix SyntaxError first.",
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
# ββ No prior codebase β no baseline β full marks βββββββββββββββββββββββββ
|
| 44 |
+
if not graph.components:
|
| 45 |
+
return {
|
| 46 |
+
"score": 1.0,
|
| 47 |
+
"feedback": "First file in episode β no consistency baseline yet.",
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
dominant = graph.conventions.get("naming", "unknown")
|
| 51 |
+
fns = [f["name"] for f in meta.get("functions", [])]
|
| 52 |
+
|
| 53 |
+
# ββ Style score βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 54 |
+
if dominant in ("unknown", "mixed") or not fns:
|
| 55 |
+
style_score = 1.0 # No clear signal β no penalty
|
| 56 |
+
else:
|
| 57 |
+
matched = sum(1 for f in fns if _naming_style(f) == dominant)
|
| 58 |
+
style_score = matched / len(fns)
|
| 59 |
+
|
| 60 |
+
# ββ Import reuse score ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 61 |
+
# Award full marks when agent isn't adding conflicting imports
|
| 62 |
+
existing_top_imports = set(
|
| 63 |
+
imp.split(".")[0]
|
| 64 |
+
for comp in graph.components.values()
|
| 65 |
+
for imp in comp.get("imports", [])
|
| 66 |
+
)
|
| 67 |
+
new_top_imports = set(
|
| 68 |
+
imp.split(".")[0]
|
| 69 |
+
for imp in meta.get("imports", [])
|
| 70 |
+
)
|
| 71 |
+
# If agent reuses existing modules β good. If agent introduces new ones β neutral.
|
| 72 |
+
reuse_score = 1.0
|
| 73 |
+
if existing_top_imports and new_top_imports:
|
| 74 |
+
reused = len(new_top_imports & existing_top_imports)
|
| 75 |
+
total_new = len(new_top_imports)
|
| 76 |
+
# Reward for reuse; no penalty for new imports (they may be required)
|
| 77 |
+
if total_new > 0:
|
| 78 |
+
reuse_score = min(1.0, 0.5 + 0.5 * (reused / total_new))
|
| 79 |
+
|
| 80 |
+
# ββ Error handling consistency ββββββββββββββββββββββββββββββββββββββββββββ
|
| 81 |
+
existing_error_style = graph.conventions.get("error_handling", "none")
|
| 82 |
+
agent_uses_try = meta.get("conventions", {}).get("uses_try_catch", False)
|
| 83 |
+
|
| 84 |
+
if existing_error_style == "try_catch" and not agent_uses_try:
|
| 85 |
+
error_score = 0.5 # Codebase uses try/catch; agent skipped it
|
| 86 |
+
else:
|
| 87 |
+
error_score = 1.0
|
| 88 |
+
|
| 89 |
+
# ββ Final score βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 90 |
+
final = round(style_score * 0.5 + reuse_score * 0.3 + error_score * 0.2, 4)
|
| 91 |
+
|
| 92 |
+
feedback = (
|
| 93 |
+
f"Style:{style_score:.2f} (dominant={dominant}) | "
|
| 94 |
+
f"Reuse:{reuse_score:.2f} | "
|
| 95 |
+
f"ErrorHandling:{error_score:.2f}"
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
return {"score": final, "feedback": feedback}
|
graders/correctness.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
graders/correctness.py β Functional test runner.
|
| 3 |
+
Weight: 25% of total reward.
|
| 4 |
+
|
| 5 |
+
Runs agent code against each task's test_cases list.
|
| 6 |
+
Handles: None inputs, empty strings, boundary values, DoS strings.
|
| 7 |
+
Returns partial credit: passed / total β never 0.0 for close attempts.
|
| 8 |
+
"""
|
| 9 |
+
from sandbox.executor import safe_exec
|
| 10 |
+
from typing import Dict, Any
|
| 11 |
+
import json
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def grade_correctness(code: str, test_cases: list) -> Dict[str, Any]:
|
| 15 |
+
"""
|
| 16 |
+
Run all test cases. Return score + per-test feedback.
|
| 17 |
+
|
| 18 |
+
Each test case format:
|
| 19 |
+
{"input": <any>, "expected": <any>}
|
| 20 |
+
or
|
| 21 |
+
{"input": (<arg1>, <arg2>), "expected": <any>, "fn": "function_name"}
|
| 22 |
+
"""
|
| 23 |
+
if not test_cases:
|
| 24 |
+
return {"score": 1.0, "feedback": "No test cases defined.", "passed": 0, "total": 0}
|
| 25 |
+
|
| 26 |
+
passed = 0
|
| 27 |
+
details = []
|
| 28 |
+
|
| 29 |
+
for i, tc in enumerate(test_cases):
|
| 30 |
+
inp = tc.get("input")
|
| 31 |
+
expected = tc.get("expected")
|
| 32 |
+
fn_name = tc.get("fn", "run_task")
|
| 33 |
+
|
| 34 |
+
# Build test wrapper
|
| 35 |
+
if isinstance(inp, (list, tuple)):
|
| 36 |
+
call_str = f"{fn_name}(*{repr(inp)})"
|
| 37 |
+
else:
|
| 38 |
+
call_str = f"{fn_name}({repr(inp)})"
|
| 39 |
+
|
| 40 |
+
wrapper = f"""{code}
|
| 41 |
+
|
| 42 |
+
import json, sys
|
| 43 |
+
|
| 44 |
+
_expected = {repr(expected)}
|
| 45 |
+
try:
|
| 46 |
+
_result = {call_str}
|
| 47 |
+
_ok = (_result == _expected)
|
| 48 |
+
print(json.dumps({{"result": str(_result)[:200], "ok": _ok}}))
|
| 49 |
+
except Exception as e:
|
| 50 |
+
print(json.dumps({{"result": None, "ok": False, "error": str(e)[:200]}}))
|
| 51 |
+
"""
|
| 52 |
+
result = safe_exec(wrapper, str(inp)[:60], timeout=4)
|
| 53 |
+
|
| 54 |
+
if result["ok"]:
|
| 55 |
+
out = result.get("output", {})
|
| 56 |
+
if isinstance(out, dict) and out.get("ok"):
|
| 57 |
+
passed += 1
|
| 58 |
+
details.append({"test": i, "status": "pass", "input": str(inp)[:60]})
|
| 59 |
+
else:
|
| 60 |
+
err = out.get("error", "") if isinstance(out, dict) else ""
|
| 61 |
+
got = out.get("result", "?") if isinstance(out, dict) else str(out)
|
| 62 |
+
details.append({
|
| 63 |
+
"test": i, "status": "fail",
|
| 64 |
+
"input": str(inp)[:60],
|
| 65 |
+
"got": str(got)[:60],
|
| 66 |
+
"expected": str(expected)[:60],
|
| 67 |
+
"error": err[:60],
|
| 68 |
+
})
|
| 69 |
+
else:
|
| 70 |
+
details.append({
|
| 71 |
+
"test": i, "status": "error",
|
| 72 |
+
"input": str(inp)[:60],
|
| 73 |
+
"error": result.get("error", "")[:80],
|
| 74 |
+
})
|
| 75 |
+
|
| 76 |
+
score = round(passed / len(test_cases), 4)
|
| 77 |
+
|
| 78 |
+
if score >= 0.9:
|
| 79 |
+
feedback = f"Excellent β {passed}/{len(test_cases)} tests passed."
|
| 80 |
+
elif score >= 0.7:
|
| 81 |
+
feedback = f"Good β {passed}/{len(test_cases)} passed. Check edge cases."
|
| 82 |
+
elif score >= 0.5:
|
| 83 |
+
feedback = f"Partial β {passed}/{len(test_cases)} passed. Review None/empty handling."
|
| 84 |
+
else:
|
| 85 |
+
feedback = f"Poor β {passed}/{len(test_cases)} passed. Core logic has issues."
|
| 86 |
+
|
| 87 |
+
return {
|
| 88 |
+
"score": score,
|
| 89 |
+
"feedback": feedback,
|
| 90 |
+
"passed": passed,
|
| 91 |
+
"total": len(test_cases),
|
| 92 |
+
"details": details,
|
| 93 |
+
}
|
graders/documentation.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
graders/documentation.py β Documentation quality grader.
|
| 3 |
+
Weight: 5% of total reward.
|
| 4 |
+
|
| 5 |
+
Checks:
|
| 6 |
+
- Functions have docstrings
|
| 7 |
+
- Type hints on parameters and return values
|
| 8 |
+
- No bare except clauses
|
| 9 |
+
"""
|
| 10 |
+
import ast
|
| 11 |
+
from typing import Dict, Any
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def grade_documentation(code: str) -> Dict[str, Any]:
|
| 15 |
+
try:
|
| 16 |
+
tree = ast.parse(code)
|
| 17 |
+
except SyntaxError:
|
| 18 |
+
return {"score": 0.0, "feedback": "SyntaxError β cannot check documentation."}
|
| 19 |
+
|
| 20 |
+
functions = [n for n in ast.walk(tree) if isinstance(n, ast.FunctionDef)]
|
| 21 |
+
if not functions:
|
| 22 |
+
return {"score": 0.8, "feedback": "No functions found β partial credit."}
|
| 23 |
+
|
| 24 |
+
has_docstring = sum(1 for f in functions if ast.get_docstring(f))
|
| 25 |
+
has_type_hints = sum(
|
| 26 |
+
1 for f in functions
|
| 27 |
+
if f.returns or any(a.annotation for a in f.args.args)
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
doc_score = has_docstring / len(functions)
|
| 31 |
+
hint_score = has_type_hints / len(functions)
|
| 32 |
+
final = round(doc_score * 0.5 + hint_score * 0.5, 4)
|
| 33 |
+
|
| 34 |
+
return {
|
| 35 |
+
"score": final,
|
| 36 |
+
"feedback": (
|
| 37 |
+
f"{has_docstring}/{len(functions)} functions have docstrings, "
|
| 38 |
+
f"{has_type_hints}/{len(functions)} have type hints."
|
| 39 |
+
),
|
| 40 |
+
}
|
graders/performance.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
graders/performance.py β Relative performance grader.
|
| 3 |
+
Weight: 10% of total reward.
|
| 4 |
+
|
| 5 |
+
Never uses absolute millisecond thresholds β machines vary.
|
| 6 |
+
Score = 1.0 means agent matches optimal speed.
|
| 7 |
+
Score = 0.0 means agent is as slow as the naive solution.
|
| 8 |
+
Intermediate: linear interpolation.
|
| 9 |
+
|
| 10 |
+
Also checks memory via tracemalloc (peak bytes).
|
| 11 |
+
"""
|
| 12 |
+
from sandbox.executor import safe_exec
|
| 13 |
+
from typing import Dict, Any
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def grade_performance(code: str, task: dict) -> Dict[str, Any]:
|
| 17 |
+
"""
|
| 18 |
+
Grade performance relative to naive and optimal baselines.
|
| 19 |
+
Uses task['naive_baseline'] timing hints since we can't run all baselines live.
|
| 20 |
+
|
| 21 |
+
For the hackathon, we use a hybrid approach:
|
| 22 |
+
- Measure actual execution time via subprocess
|
| 23 |
+
- Compare against task-defined naive_baseline hints
|
| 24 |
+
- Bonus for efficient algorithms (no nested loops on large inputs)
|
| 25 |
+
"""
|
| 26 |
+
naive_baseline = task.get("naive_baseline", {})
|
| 27 |
+
naive_time_ms = naive_baseline.get("time_ms", 10)
|
| 28 |
+
|
| 29 |
+
# Build a timing harness
|
| 30 |
+
timer_code = f"""
|
| 31 |
+
{code}
|
| 32 |
+
|
| 33 |
+
import time, json, tracemalloc
|
| 34 |
+
|
| 35 |
+
_test_input = {repr(task.get("perf_input", "test_input_for_perf"))}
|
| 36 |
+
|
| 37 |
+
# Warmup
|
| 38 |
+
try:
|
| 39 |
+
run_task(_test_input)
|
| 40 |
+
except Exception:
|
| 41 |
+
pass
|
| 42 |
+
|
| 43 |
+
# Time 3 runs
|
| 44 |
+
tracemalloc.start()
|
| 45 |
+
_times = []
|
| 46 |
+
for _ in range(3):
|
| 47 |
+
_t0 = time.perf_counter()
|
| 48 |
+
try:
|
| 49 |
+
run_task(_test_input)
|
| 50 |
+
except Exception:
|
| 51 |
+
pass
|
| 52 |
+
_times.append((time.perf_counter() - _t0) * 1000)
|
| 53 |
+
|
| 54 |
+
_, _peak = tracemalloc.get_traced_memory()
|
| 55 |
+
tracemalloc.stop()
|
| 56 |
+
|
| 57 |
+
print(json.dumps({{
|
| 58 |
+
"avg_ms": sum(_times) / len(_times),
|
| 59 |
+
"min_ms": min(_times),
|
| 60 |
+
"peak_kb": _peak / 1024,
|
| 61 |
+
}}))
|
| 62 |
+
"""
|
| 63 |
+
result = safe_exec(timer_code, "", timeout=10)
|
| 64 |
+
|
| 65 |
+
if not result["ok"]:
|
| 66 |
+
return {
|
| 67 |
+
"score": 0.5,
|
| 68 |
+
"feedback": "Could not measure performance β code may have errors.",
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
out = result.get("output", {})
|
| 72 |
+
if not isinstance(out, dict):
|
| 73 |
+
return {"score": 0.5, "feedback": "Performance measurement failed."}
|
| 74 |
+
|
| 75 |
+
avg_ms = out.get("avg_ms", naive_time_ms)
|
| 76 |
+
peak_kb = out.get("peak_kb", 100)
|
| 77 |
+
|
| 78 |
+
# Score relative to naive baseline
|
| 79 |
+
# If faster than naive β >=0.5 score; if at naive speed β 0.5; faster β higher
|
| 80 |
+
if naive_time_ms > 0:
|
| 81 |
+
ratio = avg_ms / naive_time_ms
|
| 82 |
+
if ratio <= 0.5:
|
| 83 |
+
time_score = 1.0
|
| 84 |
+
elif ratio <= 1.0:
|
| 85 |
+
time_score = 1.0 - 0.5 * (ratio - 0.5) / 0.5
|
| 86 |
+
elif ratio <= 2.0:
|
| 87 |
+
time_score = 0.5 - 0.3 * (ratio - 1.0)
|
| 88 |
+
else:
|
| 89 |
+
time_score = max(0.1, 0.2 - 0.05 * (ratio - 2.0))
|
| 90 |
+
else:
|
| 91 |
+
time_score = 0.7
|
| 92 |
+
|
| 93 |
+
# Memory score: penalise if using >1MB for simple tasks
|
| 94 |
+
if peak_kb < 100:
|
| 95 |
+
mem_score = 1.0
|
| 96 |
+
elif peak_kb < 500:
|
| 97 |
+
mem_score = 0.8
|
| 98 |
+
elif peak_kb < 2000:
|
| 99 |
+
mem_score = 0.6
|
| 100 |
+
else:
|
| 101 |
+
mem_score = max(0.2, 1.0 - peak_kb / 10000)
|
| 102 |
+
|
| 103 |
+
final = round(time_score * 0.7 + mem_score * 0.3, 4)
|
| 104 |
+
|
| 105 |
+
return {
|
| 106 |
+
"score": final,
|
| 107 |
+
"feedback": (
|
| 108 |
+
f"avg={avg_ms:.1f}ms, peak_mem={peak_kb:.0f}KB. "
|
| 109 |
+
f"Time score={time_score:.2f}, Memory score={mem_score:.2f}."
|
| 110 |
+
),
|
| 111 |
+
"avg_ms": avg_ms,
|
| 112 |
+
"peak_kb": peak_kb,
|
| 113 |
+
}
|
graders/reward_aggregator.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
graders/reward_aggregator.py β Weighted reward computation.
|
| 3 |
+
|
| 4 |
+
Weights (must sum to 1.0):
|
| 5 |
+
correctness: 25% β does it work?
|
| 6 |
+
attack_resist: 25% β does it resist attacks? (behavioral, unfakeable)
|
| 7 |
+
static_security:15% β does bandit/semgrep approve?
|
| 8 |
+
consistency: 15% β does it match codebase conventions?
|
| 9 |
+
performance: 10% β is it fast/lean?
|
| 10 |
+
documentation: 5% β docstrings + type hints?
|
| 11 |
+
code_structure: 3% β no print, no bare except, etc.
|
| 12 |
+
supply_chain: 2% β no typosquatted/malicious imports?
|
| 13 |
+
|
| 14 |
+
Attack resistance weight increased to 25% (was 20% in V1) because V2
|
| 15 |
+
uses behavioral harnesses β the check is now provably unfakeable.
|
| 16 |
+
"""
|
| 17 |
+
from graders.correctness import grade_correctness
|
| 18 |
+
from graders.attacks import grade_attack_resistance
|
| 19 |
+
from graders.static_analysis import grade_static
|
| 20 |
+
from graders.consistency import grade_consistency
|
| 21 |
+
from graders.performance import grade_performance
|
| 22 |
+
from graders.documentation import grade_documentation
|
| 23 |
+
from graders.supply_chain import grade_supply_chain
|
| 24 |
+
from graders.code_structure import grade_code_structure
|
| 25 |
+
from codegraph.extractor import extract_metadata
|
| 26 |
+
from typing import Dict, Any
|
| 27 |
+
|
| 28 |
+
WEIGHTS = {
|
| 29 |
+
"correctness": 0.25,
|
| 30 |
+
"attack_resist": 0.25,
|
| 31 |
+
"static_security": 0.15,
|
| 32 |
+
"consistency": 0.15,
|
| 33 |
+
"performance": 0.10,
|
| 34 |
+
"documentation": 0.05,
|
| 35 |
+
"code_structure": 0.03,
|
| 36 |
+
"supply_chain": 0.02,
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
assert abs(sum(WEIGHTS.values()) - 1.0) < 1e-9, "Weights must sum to 1.0"
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def grade_submission(
|
| 43 |
+
code: str,
|
| 44 |
+
filename: str,
|
| 45 |
+
task: dict,
|
| 46 |
+
graph,
|
| 47 |
+
step: int,
|
| 48 |
+
seed: int,
|
| 49 |
+
) -> Dict[str, Any]:
|
| 50 |
+
"""
|
| 51 |
+
Run all graders and return weighted reward.
|
| 52 |
+
|
| 53 |
+
Returns dict with:
|
| 54 |
+
scores: per-grader float scores
|
| 55 |
+
total_reward: weighted sum 0.0β1.0
|
| 56 |
+
feedback: human-readable per-grader feedback
|
| 57 |
+
new_metadata: CodeGraph metadata for this file
|
| 58 |
+
"""
|
| 59 |
+
scores: Dict[str, float] = {}
|
| 60 |
+
feedback: Dict[str, str] = {}
|
| 61 |
+
|
| 62 |
+
# ββ Correctness (25%) ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 63 |
+
r = grade_correctness(code, task.get("test_cases", []))
|
| 64 |
+
scores["correctness"] = r["score"]
|
| 65 |
+
feedback["correctness"] = r["feedback"]
|
| 66 |
+
|
| 67 |
+
# ββ Attack Resistance (25%) ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 68 |
+
r = grade_attack_resistance(code, task["id"], seed)
|
| 69 |
+
scores["attack_resist"] = r["score"]
|
| 70 |
+
feedback["attack_resist"] = r["feedback"]
|
| 71 |
+
|
| 72 |
+
# ββ Static Security (15%) ββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 73 |
+
r = grade_static(code)
|
| 74 |
+
scores["static_security"] = r["score"]
|
| 75 |
+
feedback["static_security"] = r["feedback"]
|
| 76 |
+
|
| 77 |
+
# ββ CodeGraph Consistency (15%) ββββββββββββββββββββββββββββββββββββββββββ
|
| 78 |
+
r = grade_consistency(code, filename, graph, task)
|
| 79 |
+
scores["consistency"] = r["score"]
|
| 80 |
+
feedback["consistency"] = r["feedback"]
|
| 81 |
+
|
| 82 |
+
# ββ Performance (10%) ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 83 |
+
r = grade_performance(code, task)
|
| 84 |
+
scores["performance"] = r["score"]
|
| 85 |
+
feedback["performance"] = r["feedback"]
|
| 86 |
+
|
| 87 |
+
# ββ Documentation (5%) βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 88 |
+
r = grade_documentation(code)
|
| 89 |
+
scores["documentation"] = r["score"]
|
| 90 |
+
feedback["documentation"] = r["feedback"]
|
| 91 |
+
|
| 92 |
+
# ββ Code Structure (3%) ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 93 |
+
r = grade_code_structure(code)
|
| 94 |
+
scores["code_structure"] = r["score"]
|
| 95 |
+
feedback["code_structure"] = r["feedback"]
|
| 96 |
+
|
| 97 |
+
# ββ Supply Chain (2%) ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 98 |
+
r = grade_supply_chain(code)
|
| 99 |
+
scores["supply_chain"] = r["score"]
|
| 100 |
+
feedback["supply_chain"] = r["feedback"]
|
| 101 |
+
|
| 102 |
+
# ββ Weighted total βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 103 |
+
total_reward = round(
|
| 104 |
+
sum(scores[k] * WEIGHTS[k] for k in WEIGHTS if k in scores), 4
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
# ββ CodeGraph metadata βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 108 |
+
new_metadata = extract_metadata(code, filename, step)
|
| 109 |
+
|
| 110 |
+
return {
|
| 111 |
+
"scores": scores,
|
| 112 |
+
"total_reward": total_reward,
|
| 113 |
+
"feedback": _format_feedback(scores, feedback),
|
| 114 |
+
"new_metadata": new_metadata,
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def _format_feedback(scores: Dict[str, float], raw: Dict[str, str]) -> Dict[str, str]:
|
| 119 |
+
"""Format feedback with score rating prefix."""
|
| 120 |
+
out = {}
|
| 121 |
+
for k, v in scores.items():
|
| 122 |
+
if v >= 0.9:
|
| 123 |
+
prefix = f"β
Excellent ({v:.2f})"
|
| 124 |
+
elif v >= 0.7:
|
| 125 |
+
prefix = f"π‘ Good ({v:.2f})"
|
| 126 |
+
elif v >= 0.5:
|
| 127 |
+
prefix = f"π Needs work ({v:.2f})"
|
| 128 |
+
else:
|
| 129 |
+
prefix = f"π΄ Poor ({v:.2f})"
|
| 130 |
+
detail = raw.get(k, "")
|
| 131 |
+
out[k] = f"{prefix} β {detail}" if detail else prefix
|
| 132 |
+
return out
|
graders/static_analysis.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
graders/static_analysis.py β Static security grader.
|
| 3 |
+
Weight: 15% of total reward.
|
| 4 |
+
|
| 5 |
+
Tools:
|
| 6 |
+
bandit: AST-based Python security scanner, zero-config, maps to CWE IDs
|
| 7 |
+
semgrep: Rule-based pattern matching β catches what bandit misses
|
| 8 |
+
|
| 9 |
+
Penalty schedule:
|
| 10 |
+
HIGH severity issue: -0.30
|
| 11 |
+
MEDIUM severity issue: -0.15
|
| 12 |
+
LOW severity issue: -0.05
|
| 13 |
+
|
| 14 |
+
Score = max(0.0, 1.0 - total_penalty)
|
| 15 |
+
No penalty stacking beyond score floor of 0.0.
|
| 16 |
+
"""
|
| 17 |
+
import subprocess
|
| 18 |
+
import json
|
| 19 |
+
import tempfile
|
| 20 |
+
import os
|
| 21 |
+
import re
|
| 22 |
+
from typing import Dict, Any
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ββ bandit ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 26 |
+
|
| 27 |
+
def run_bandit(code: str) -> Dict[str, Any]:
|
| 28 |
+
"""Run bandit static analysis. Returns score + issues list."""
|
| 29 |
+
with tempfile.NamedTemporaryFile(
|
| 30 |
+
mode="w", suffix=".py", delete=False, encoding="utf-8"
|
| 31 |
+
) as f:
|
| 32 |
+
f.write(code)
|
| 33 |
+
tmp = f.name
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
result = subprocess.run(
|
| 37 |
+
["bandit", "-r", tmp, "-f", "json", "-q", "--exit-zero"],
|
| 38 |
+
capture_output=True, text=True, timeout=15,
|
| 39 |
+
)
|
| 40 |
+
try:
|
| 41 |
+
data = json.loads(result.stdout or '{"results": []}')
|
| 42 |
+
except json.JSONDecodeError:
|
| 43 |
+
data = {"results": []}
|
| 44 |
+
|
| 45 |
+
issues = data.get("results", [])
|
| 46 |
+
penalty = 0.0
|
| 47 |
+
for issue in issues:
|
| 48 |
+
sev = issue.get("issue_severity", "LOW")
|
| 49 |
+
if sev == "HIGH":
|
| 50 |
+
penalty += 0.30
|
| 51 |
+
elif sev == "MEDIUM":
|
| 52 |
+
penalty += 0.15
|
| 53 |
+
else:
|
| 54 |
+
penalty += 0.05
|
| 55 |
+
|
| 56 |
+
score = max(0.0, 1.0 - penalty)
|
| 57 |
+
return {
|
| 58 |
+
"score": round(score, 4),
|
| 59 |
+
"issues": issues[:5], # Return top 5 for feedback
|
| 60 |
+
"issue_count": len(issues),
|
| 61 |
+
}
|
| 62 |
+
except FileNotFoundError:
|
| 63 |
+
# bandit not installed β skip gracefully
|
| 64 |
+
return {"score": 0.9, "issues": [], "issue_count": 0, "note": "bandit not available"}
|
| 65 |
+
except subprocess.TimeoutExpired:
|
| 66 |
+
return {"score": 0.7, "issues": [], "issue_count": 0, "note": "bandit timeout"}
|
| 67 |
+
finally:
|
| 68 |
+
try:
|
| 69 |
+
os.unlink(tmp)
|
| 70 |
+
except OSError:
|
| 71 |
+
pass
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# ββ AST heuristics (zero-dependency fallback + extras bandit misses) ββββββββββ
|
| 75 |
+
|
| 76 |
+
_DANGEROUS_PATTERNS = [
|
| 77 |
+
(r'\beval\s*\(', "HIGH", "eval() usage β arbitrary code execution risk"),
|
| 78 |
+
(r'\bexec\s*\(', "HIGH", "exec() usage β arbitrary code execution risk"),
|
| 79 |
+
(r'hashlib\.md5\b', "HIGH", "MD5 usage β broken cryptographic algorithm (CWE-327)"),
|
| 80 |
+
(r'hashlib\.sha1\b', "MEDIUM", "SHA1 usage β deprecated for security (CWE-327)"),
|
| 81 |
+
(r'random\.random\b', "MEDIUM", "random.random() β not cryptographically secure (use secrets)"),
|
| 82 |
+
(r'subprocess.*shell\s*=\s*True', "HIGH", "shell=True β shell injection risk (CWE-78)"),
|
| 83 |
+
(r'os\.system\s*\(', "HIGH", "os.system() β shell injection risk (CWE-78)"),
|
| 84 |
+
(r'pickle\.loads?\s*\(', "HIGH", "pickle β arbitrary code execution on untrusted data"),
|
| 85 |
+
(r'yaml\.load\s*\([^)]*\)', "MEDIUM", "yaml.load() without Loader β use yaml.safe_load()"),
|
| 86 |
+
(r'password\s*=\s*["\']', "MEDIUM", "Potential hardcoded password (CWE-259)"),
|
| 87 |
+
(r'secret\s*=\s*["\']', "MEDIUM", "Potential hardcoded secret"),
|
| 88 |
+
(r'f["\'].*SELECT.*\{', "HIGH", "f-string SQL construction β injection risk (CWE-89)"),
|
| 89 |
+
(r'%.*SELECT.*%', "HIGH", "%-format SQL construction β injection risk (CWE-89)"),
|
| 90 |
+
(r'\.format\(.*\).*SELECT|SELECT.*\.format', "HIGH", "str.format() SQL β injection risk (CWE-89)"),
|
| 91 |
+
]
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def run_ast_heuristics(code: str) -> Dict[str, Any]:
|
| 95 |
+
"""Fast regex-based heuristic checks as bandit supplement."""
|
| 96 |
+
issues = []
|
| 97 |
+
for pattern, severity, message in _DANGEROUS_PATTERNS:
|
| 98 |
+
if re.search(pattern, code, re.IGNORECASE):
|
| 99 |
+
issues.append({"severity": severity, "message": message})
|
| 100 |
+
|
| 101 |
+
penalty = 0.0
|
| 102 |
+
for issue in issues:
|
| 103 |
+
if issue["severity"] == "HIGH":
|
| 104 |
+
penalty += 0.25
|
| 105 |
+
elif issue["severity"] == "MEDIUM":
|
| 106 |
+
penalty += 0.10
|
| 107 |
+
else:
|
| 108 |
+
penalty += 0.04
|
| 109 |
+
|
| 110 |
+
return {
|
| 111 |
+
"score": max(0.0, 1.0 - penalty),
|
| 112 |
+
"issues": issues,
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# ββ Combined grader βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 117 |
+
|
| 118 |
+
def grade_static(code: str) -> Dict[str, Any]:
|
| 119 |
+
"""
|
| 120 |
+
Run bandit + AST heuristics, return combined score.
|
| 121 |
+
Final score = min(bandit_score, heuristic_score) β take the more pessimistic view.
|
| 122 |
+
"""
|
| 123 |
+
bandit_result = run_bandit(code)
|
| 124 |
+
heuristic_result = run_ast_heuristics(code)
|
| 125 |
+
|
| 126 |
+
# Combine: worst of both tools wins
|
| 127 |
+
combined_score = min(bandit_result["score"], heuristic_result["score"])
|
| 128 |
+
|
| 129 |
+
all_issues = bandit_result.get("issues", []) + heuristic_result.get("issues", [])
|
| 130 |
+
issue_count = len(all_issues)
|
| 131 |
+
|
| 132 |
+
if combined_score >= 0.9:
|
| 133 |
+
feedback = "No significant static vulnerabilities detected."
|
| 134 |
+
elif combined_score >= 0.7:
|
| 135 |
+
feedback = f"{issue_count} minor issue(s) found. Review bandit output."
|
| 136 |
+
elif combined_score >= 0.5:
|
| 137 |
+
feedback = f"{issue_count} moderate issue(s). Avoid eval/exec, weak crypto, shell=True."
|
| 138 |
+
else:
|
| 139 |
+
feedback = f"{issue_count} HIGH severity issue(s). Critical: remove eval/exec, use parameterised queries, avoid MD5/SHA1."
|
| 140 |
+
|
| 141 |
+
return {
|
| 142 |
+
"score": round(combined_score, 4),
|
| 143 |
+
"feedback": feedback,
|
| 144 |
+
"issue_count": issue_count,
|
| 145 |
+
"bandit_score": bandit_result["score"],
|
| 146 |
+
"heuristic_score": heuristic_result["score"],
|
| 147 |
+
"issues": all_issues[:5],
|
| 148 |
+
}
|
graders/supply_chain.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
graders/supply_chain.py β Supply chain security grader (NEW in V2).
|
| 3 |
+
Weight: 2% of total reward.
|
| 4 |
+
|
| 5 |
+
V1 flaw: an agent could "solve" a task by importing a typosquatted or
|
| 6 |
+
known-vulnerable package. This grader catches that.
|
| 7 |
+
|
| 8 |
+
Checks:
|
| 9 |
+
1. KNOWN_TYPOSQUATS β common misspellings of popular packages
|
| 10 |
+
2. KNOWN_DANGEROUS β packages known to have been malicious
|
| 11 |
+
3. pip-audit β PyPI advisory database (when available)
|
| 12 |
+
"""
|
| 13 |
+
import ast
|
| 14 |
+
import re
|
| 15 |
+
from typing import Dict, Any, List
|
| 16 |
+
|
| 17 |
+
KNOWN_TYPOSQUATS = {
|
| 18 |
+
# requests misspellings
|
| 19 |
+
"reqeusts", "requets", "reqests", "requestss",
|
| 20 |
+
# urllib3
|
| 21 |
+
"urlib3", "urllib3s", "urllib",
|
| 22 |
+
# cryptography
|
| 23 |
+
"crpytography", "cryptograpy", "cyptography",
|
| 24 |
+
# pyyaml
|
| 25 |
+
"pyymal", "pyamml", "pyaml",
|
| 26 |
+
# setuptools
|
| 27 |
+
"setuptool", "setup-tools",
|
| 28 |
+
# numpy
|
| 29 |
+
"numppy", "numy",
|
| 30 |
+
# pillow
|
| 31 |
+
"pillo", "pil2",
|
| 32 |
+
# flask
|
| 33 |
+
"falsk", "flaask",
|
| 34 |
+
# django
|
| 35 |
+
"djano", "djangoo",
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
KNOWN_DANGEROUS = {
|
| 39 |
+
"malicious", "evilpackage", "xss-package",
|
| 40 |
+
"colourama", # typosquat of colorama
|
| 41 |
+
"python-dateutil2",
|
| 42 |
+
"urllib-parse",
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
STDLIB_SAFE = {
|
| 46 |
+
"os", "sys", "json", "re", "ast", "io", "typing", "collections",
|
| 47 |
+
"hashlib", "hmac", "secrets", "subprocess", "tempfile", "pathlib",
|
| 48 |
+
"sqlite3", "time", "datetime", "functools", "itertools", "math",
|
| 49 |
+
"string", "struct", "base64", "urllib", "http", "email", "logging",
|
| 50 |
+
"unittest", "abc", "contextlib", "dataclasses", "enum", "uuid",
|
| 51 |
+
"socket", "ssl", "threading", "multiprocessing", "asyncio",
|
| 52 |
+
"tracemalloc", "timeit", "cProfile", "pprint", "textwrap",
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def extract_imports(code: str) -> List[str]:
|
| 57 |
+
try:
|
| 58 |
+
tree = ast.parse(code)
|
| 59 |
+
except SyntaxError:
|
| 60 |
+
# Fallback: regex
|
| 61 |
+
matches = re.findall(r'^\s*import\s+(\w+)|^\s*from\s+(\w+)', code, re.MULTILINE)
|
| 62 |
+
return list({m[0] or m[1] for m in matches if m[0] or m[1]})
|
| 63 |
+
|
| 64 |
+
packages = []
|
| 65 |
+
for node in ast.walk(tree):
|
| 66 |
+
if isinstance(node, ast.Import):
|
| 67 |
+
packages += [a.name.split(".")[0] for a in node.names]
|
| 68 |
+
elif isinstance(node, ast.ImportFrom) and node.module:
|
| 69 |
+
packages.append(node.module.split(".")[0])
|
| 70 |
+
return list(set(packages))
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def grade_supply_chain(code: str) -> Dict[str, Any]:
|
| 74 |
+
packages = extract_imports(code)
|
| 75 |
+
flagged = []
|
| 76 |
+
penalty = 0.0
|
| 77 |
+
|
| 78 |
+
for pkg in packages:
|
| 79 |
+
pkg_lower = pkg.lower()
|
| 80 |
+
if pkg_lower in KNOWN_TYPOSQUATS:
|
| 81 |
+
flagged.append({"package": pkg, "reason": "typosquat"})
|
| 82 |
+
penalty += 0.5
|
| 83 |
+
elif pkg_lower in KNOWN_DANGEROUS:
|
| 84 |
+
flagged.append({"package": pkg, "reason": "known_malicious"})
|
| 85 |
+
penalty += 1.0
|
| 86 |
+
|
| 87 |
+
score = max(0.0, 1.0 - penalty)
|
| 88 |
+
|
| 89 |
+
if flagged:
|
| 90 |
+
feedback = f"Suspicious packages detected: {[f['package'] for f in flagged]}. Use well-known packages only."
|
| 91 |
+
else:
|
| 92 |
+
feedback = f"No suspicious imports detected. Checked {len(packages)} package(s)."
|
| 93 |
+
|
| 94 |
+
return {
|
| 95 |
+
"score": round(score, 4),
|
| 96 |
+
"feedback": feedback,
|
| 97 |
+
"flagged": flagged,
|
| 98 |
+
"packages_checked": packages,
|
| 99 |
+
}
|
inference.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
inference.py β Baseline inference script (REQUIRED by hackathon).
|
| 3 |
+
|
| 4 |
+
CRITICAL requirements:
|
| 5 |
+
- Must use OpenAI client (hackathon rule β Groq/Gemini both support it)
|
| 6 |
+
- Must complete in < 20 minutes on 2 vCPU / 8GB RAM
|
| 7 |
+
- Must be in project root
|
| 8 |
+
- env vars: API_BASE_URL, MODEL_NAME, HF_TOKEN, ENV_URL
|
| 9 |
+
|
| 10 |
+
Compatible with:
|
| 11 |
+
- Groq free tier: API_BASE_URL=https://api.groq.com/openai/v1, MODEL_NAME=llama-3.3-70b-versatile
|
| 12 |
+
- Gemini Flash: API_BASE_URL=https://generativelanguage.googleapis.com/v1beta/openai, MODEL_NAME=gemini-2.5-flash
|
| 13 |
+
- OpenAI: API_BASE_URL=https://api.openai.com/v1, MODEL_NAME=gpt-4o-mini
|
| 14 |
+
"""
|
| 15 |
+
import os
|
| 16 |
+
import json
|
| 17 |
+
import time
|
| 18 |
+
import requests
|
| 19 |
+
from openai import OpenAI
|
| 20 |
+
|
| 21 |
+
# ββ Config (from environment variables) ββββββββββββββββββββββββββββββββββββββ
|
| 22 |
+
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.groq.com/openai/v1")
|
| 23 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "llama-3.3-70b-versatile")
|
| 24 |
+
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 25 |
+
ENV_URL = os.environ.get("ENV_URL", "http://localhost:7860")
|
| 26 |
+
|
| 27 |
+
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN or "dummy")
|
| 28 |
+
|
| 29 |
+
# ββ System prompt βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
+
SYSTEM_PROMPT = """You are a Python security engineer writing production-ready, secure Python code.
|
| 31 |
+
|
| 32 |
+
When given a task, write ONLY the Python function β no explanations, no markdown fences, no comments outside the function.
|
| 33 |
+
|
| 34 |
+
Your code MUST:
|
| 35 |
+
1. Solve the problem correctly β handle None, empty string, boundary values
|
| 36 |
+
2. Resist security attacks: SQL injection, path traversal, auth bypass, XSS
|
| 37 |
+
3. Use PARAMETERISED queries β NEVER string-format user input into SQL
|
| 38 |
+
4. Validate and sanitise ALL inputs before use
|
| 39 |
+
5. Use proper type hints on all function signatures
|
| 40 |
+
6. Have a docstring explaining what the function does
|
| 41 |
+
7. Use try/except with specific exception types (not bare except)
|
| 42 |
+
8. Follow the naming and error-handling conventions shown in CODEBASE CONTEXT
|
| 43 |
+
9. Import only well-known standard library or PyPI packages
|
| 44 |
+
|
| 45 |
+
CRITICAL SECURITY RULES:
|
| 46 |
+
- SQL: always use cursor.execute(sql, (param,)) β never f-strings or % formatting
|
| 47 |
+
- Paths: always use Path.resolve() and check prefix against safe base directory
|
| 48 |
+
- JWT: always specify algorithms=["HS256"] explicitly
|
| 49 |
+
- Auth: always use hmac.compare_digest() for constant-time comparison
|
| 50 |
+
- Hashing: use SHA-256 or stronger β never MD5/SHA1
|
| 51 |
+
- Never use eval(), exec(), or subprocess with shell=True
|
| 52 |
+
"""
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def compress_graph(graph: dict, limit: int = 6000) -> str:
|
| 56 |
+
"""
|
| 57 |
+
Semantic compression: keep signatures and conventions, drop function bodies.
|
| 58 |
+
V1 used [:2000] blind truncation β agents couldn't see the patterns they needed.
|
| 59 |
+
V2 keeps what matters, drops what doesn't.
|
| 60 |
+
"""
|
| 61 |
+
slim = {
|
| 62 |
+
"conventions": graph.get("conventions", {}),
|
| 63 |
+
"components": {}
|
| 64 |
+
}
|
| 65 |
+
for name, comp in graph.get("components", {}).items():
|
| 66 |
+
slim["components"][name] = {
|
| 67 |
+
"file": comp.get("file", ""),
|
| 68 |
+
"language": comp.get("language", "py"),
|
| 69 |
+
"functions": [f["name"] if isinstance(f, dict) else f for f in comp.get("functions", [])][:20],
|
| 70 |
+
"imports": [i.split(".")[0] for i in comp.get("imports", [])][:15],
|
| 71 |
+
"uses_try_catch": comp.get("conventions", {}).get("uses_try_catch", False),
|
| 72 |
+
"uses_type_hints": comp.get("conventions", {}).get("uses_type_hints", False),
|
| 73 |
+
}
|
| 74 |
+
result = json.dumps(slim, indent=2)
|
| 75 |
+
if len(result) > limit:
|
| 76 |
+
for name in slim["components"]:
|
| 77 |
+
slim["components"][name].pop("imports", None)
|
| 78 |
+
result = json.dumps(slim, indent=2)[:limit]
|
| 79 |
+
return result
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def call_llm(messages: list, timeout_s: int = 60) -> str:
|
| 83 |
+
"""Call LLM with exponential backoff retry on rate limit."""
|
| 84 |
+
for attempt in range(3):
|
| 85 |
+
try:
|
| 86 |
+
resp = client.chat.completions.create(
|
| 87 |
+
model=MODEL_NAME,
|
| 88 |
+
messages=messages,
|
| 89 |
+
max_tokens=1024,
|
| 90 |
+
temperature=0.2,
|
| 91 |
+
)
|
| 92 |
+
return resp.choices[0].message.content.strip()
|
| 93 |
+
except Exception as e:
|
| 94 |
+
err_str = str(e).lower()
|
| 95 |
+
if "rate_limit" in err_str or "429" in err_str:
|
| 96 |
+
wait = 2 ** attempt
|
| 97 |
+
print(f" Rate limited. Waiting {wait}s...")
|
| 98 |
+
time.sleep(wait)
|
| 99 |
+
else:
|
| 100 |
+
raise
|
| 101 |
+
return ""
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def strip_markdown(code: str) -> str:
|
| 105 |
+
"""Strip markdown code fences if LLM added them."""
|
| 106 |
+
if "```python" in code:
|
| 107 |
+
code = code.split("```python")[1].split("```")[0]
|
| 108 |
+
elif "```" in code:
|
| 109 |
+
parts = code.split("```")
|
| 110 |
+
if len(parts) >= 3:
|
| 111 |
+
code = parts[1]
|
| 112 |
+
return code.strip()
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def run_episode(difficulty: str = "medium") -> dict:
|
| 116 |
+
"""Run one full RL episode with up to 5 improvement steps."""
|
| 117 |
+
# Reset environment
|
| 118 |
+
try:
|
| 119 |
+
reset_resp = requests.post(
|
| 120 |
+
f"{ENV_URL}/reset",
|
| 121 |
+
json={"difficulty": difficulty},
|
| 122 |
+
timeout=30,
|
| 123 |
+
)
|
| 124 |
+
reset_resp.raise_for_status()
|
| 125 |
+
episode = reset_resp.json()
|
| 126 |
+
except Exception as e:
|
| 127 |
+
print(f" ERROR: Could not reset env: {e}")
|
| 128 |
+
return {"task": "unknown", "scores": [], "final_score": 0.0, "improved": False}
|
| 129 |
+
|
| 130 |
+
sid = episode["session_id"]
|
| 131 |
+
scores_history = []
|
| 132 |
+
print(f"\n Task: {episode['task_id']} | CWEs: {episode.get('cwe_targets', [])}")
|
| 133 |
+
|
| 134 |
+
for step_num in range(5):
|
| 135 |
+
context_str = compress_graph(episode.get("codegraph", {}))
|
| 136 |
+
|
| 137 |
+
messages = [
|
| 138 |
+
{"role": "system", "content": SYSTEM_PROMPT},
|
| 139 |
+
{"role": "user", "content": f"""Task: {episode['problem_statement']}
|
| 140 |
+
|
| 141 |
+
Security targets: {episode.get('cwe_targets', [])}
|
| 142 |
+
|
| 143 |
+
CODEBASE CONTEXT (follow these conventions exactly):
|
| 144 |
+
{context_str}
|
| 145 |
+
|
| 146 |
+
Starter code to build from:
|
| 147 |
+
{episode.get('starter_code', '# Write your implementation here')}
|
| 148 |
+
|
| 149 |
+
Write the complete, secure Python function now. Return ONLY the code, no markdown:"""}
|
| 150 |
+
]
|
| 151 |
+
|
| 152 |
+
try:
|
| 153 |
+
code = call_llm(messages)
|
| 154 |
+
except Exception as e:
|
| 155 |
+
print(f" Step {step_num+1}: LLM error β {e}")
|
| 156 |
+
break
|
| 157 |
+
|
| 158 |
+
code = strip_markdown(code)
|
| 159 |
+
if not code.strip():
|
| 160 |
+
print(f" Step {step_num+1}: Empty response from LLM")
|
| 161 |
+
break
|
| 162 |
+
|
| 163 |
+
try:
|
| 164 |
+
step_resp = requests.post(
|
| 165 |
+
f"{ENV_URL}/step",
|
| 166 |
+
json={
|
| 167 |
+
"session_id": sid,
|
| 168 |
+
"task_id": episode["task_id"],
|
| 169 |
+
"filename": f"solution_step{step_num}.py",
|
| 170 |
+
"code": code,
|
| 171 |
+
},
|
| 172 |
+
timeout=60,
|
| 173 |
+
)
|
| 174 |
+
step_resp.raise_for_status()
|
| 175 |
+
result = step_resp.json()
|
| 176 |
+
except Exception as e:
|
| 177 |
+
print(f" Step {step_num+1}: Submit error β {e}")
|
| 178 |
+
break
|
| 179 |
+
|
| 180 |
+
reward = result.get("total_reward", 0.0)
|
| 181 |
+
scores_history.append(reward)
|
| 182 |
+
done = result.get("done", False)
|
| 183 |
+
|
| 184 |
+
print(f" Step {step_num+1}: reward={reward:.4f} done={done}")
|
| 185 |
+
for dim, fb in result.get("feedback", {}).items():
|
| 186 |
+
print(f" {dim}: {fb}")
|
| 187 |
+
|
| 188 |
+
# Update context for next step
|
| 189 |
+
episode["codegraph"] = result.get("codegraph", {})
|
| 190 |
+
|
| 191 |
+
if done:
|
| 192 |
+
break
|
| 193 |
+
|
| 194 |
+
final = scores_history[-1] if scores_history else 0.0
|
| 195 |
+
improved = len(scores_history) > 1 and scores_history[-1] > scores_history[0]
|
| 196 |
+
return {
|
| 197 |
+
"task": episode["task_id"],
|
| 198 |
+
"scores": scores_history,
|
| 199 |
+
"final_score": final,
|
| 200 |
+
"improved": improved,
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
if __name__ == "__main__":
|
| 205 |
+
start = time.time()
|
| 206 |
+
results = []
|
| 207 |
+
|
| 208 |
+
print("=" * 60)
|
| 209 |
+
print("SecureCodeEnv V2 β Baseline Inference")
|
| 210 |
+
print(f"Model: {MODEL_NAME}")
|
| 211 |
+
print(f"Env: {ENV_URL}")
|
| 212 |
+
print("=" * 60)
|
| 213 |
+
|
| 214 |
+
for difficulty in ["easy", "medium", "hard"]:
|
| 215 |
+
print(f"\n{'='*20} {difficulty.upper()} {'='*20}")
|
| 216 |
+
r = run_episode(difficulty)
|
| 217 |
+
results.append(r)
|
| 218 |
+
|
| 219 |
+
elapsed = time.time() - start
|
| 220 |
+
|
| 221 |
+
print("\n" + "=" * 60)
|
| 222 |
+
print("FINAL RESULTS")
|
| 223 |
+
print("=" * 60)
|
| 224 |
+
for r in results:
|
| 225 |
+
improved_str = "β improved" if r["improved"] else "β flat"
|
| 226 |
+
print(f" {r['task']}: {r['final_score']:.4f} [{improved_str}] steps={r['scores']}")
|
| 227 |
+
|
| 228 |
+
avg = sum(r["final_score"] for r in results) / len(results) if results else 0
|
| 229 |
+
print(f"\nMean final reward: {avg:.4f}")
|
| 230 |
+
print(f"Total time: {elapsed:.1f}s")
|
| 231 |
+
|
| 232 |
+
# Hackathon requirement: must complete in < 20 minutes
|
| 233 |
+
assert elapsed < 1200, f"Exceeded 20-minute time limit ({elapsed:.1f}s)"
|
| 234 |
+
print("\nβ
Completed within time limit.")
|
openenv.yaml
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# openenv.yaml β OpenEnv specification (required by hackathon)
|
| 2 |
+
# SecureCodeEnv V2 β Production-Ready Secure Code Generation RL Environment
|
| 3 |
+
# Author: Vishal Dhakad (vishaldhakad)
|
| 4 |
+
# Meta Γ HuggingFace OpenEnv Hackathon 2026
|
| 5 |
+
|
| 6 |
+
name: SecureCodeEnv
|
| 7 |
+
version: "2.0"
|
| 8 |
+
description: >
|
| 9 |
+
RL environment for training LLM agents to write production-ready, secure Python code.
|
| 10 |
+
9 CWE-grounded tasks across 3 difficulty tiers. 8-dimensional reward system.
|
| 11 |
+
Unique features: behavioral adversarial attack grading (unfakeable),
|
| 12 |
+
CodeGraph cross-file consistency memory system (novel in RL), multi-language parsing.
|
| 13 |
+
|
| 14 |
+
author: vishaldhakad
|
| 15 |
+
hf_space: vishaldhakad/SecureCodeEnv
|
| 16 |
+
|
| 17 |
+
server:
|
| 18 |
+
host: 0.0.0.0
|
| 19 |
+
port: 7860
|
| 20 |
+
workers: 2
|
| 21 |
+
|
| 22 |
+
endpoints:
|
| 23 |
+
reset:
|
| 24 |
+
method: POST
|
| 25 |
+
path: /reset
|
| 26 |
+
description: >
|
| 27 |
+
Start new episode. Picks task at given difficulty, initialises CodeGraph,
|
| 28 |
+
creates Redis-backed session. Returns task, starter code, CodeGraph, session_id.
|
| 29 |
+
params:
|
| 30 |
+
difficulty: "easy | medium | hard (default: medium)"
|
| 31 |
+
session_id: "optional UUID β generated if not provided"
|
| 32 |
+
|
| 33 |
+
step:
|
| 34 |
+
method: POST
|
| 35 |
+
path: /step
|
| 36 |
+
description: >
|
| 37 |
+
Submit agent code. Runs all 8 graders (correctness, behavioral attacks,
|
| 38 |
+
static analysis, consistency, performance, documentation, code structure,
|
| 39 |
+
supply chain). Updates CodeGraph. Returns weighted reward + per-grader feedback.
|
| 40 |
+
body:
|
| 41 |
+
code: "Python source code string"
|
| 42 |
+
filename: "logical filename for CodeGraph tracking"
|
| 43 |
+
task_id: "task identifier from /reset"
|
| 44 |
+
session_id: "UUID from /reset"
|
| 45 |
+
|
| 46 |
+
state:
|
| 47 |
+
method: GET
|
| 48 |
+
path: /state
|
| 49 |
+
description: Read current episode state without advancing it.
|
| 50 |
+
params:
|
| 51 |
+
session_id: "UUID from /reset"
|
| 52 |
+
|
| 53 |
+
action_space:
|
| 54 |
+
type: text
|
| 55 |
+
description: Python (or JS/TS) source code string submitted by the agent
|
| 56 |
+
constraints:
|
| 57 |
+
max_length: 50000 # 50KB hard limit
|
| 58 |
+
min_length: 1
|
| 59 |
+
|
| 60 |
+
observation_space:
|
| 61 |
+
type: structured_json
|
| 62 |
+
fields:
|
| 63 |
+
- name: total_reward
|
| 64 |
+
type: float
|
| 65 |
+
range: [0.0, 1.0]
|
| 66 |
+
description: Weighted sum of all grader scores
|
| 67 |
+
- name: scores
|
| 68 |
+
type: dict
|
| 69 |
+
description: Per-grader scores (correctness, attack_resist, static_security, etc.)
|
| 70 |
+
- name: feedback
|
| 71 |
+
type: dict
|
| 72 |
+
description: Human-readable feedback per dimension with emoji rating
|
| 73 |
+
- name: codegraph
|
| 74 |
+
type: dict
|
| 75 |
+
description: Full codebase context β conventions, components, imports
|
| 76 |
+
- name: done
|
| 77 |
+
type: bool
|
| 78 |
+
description: True when reward >= 0.90 or step_count >= 5
|
| 79 |
+
|
| 80 |
+
reward:
|
| 81 |
+
type: multi_dimensional
|
| 82 |
+
range: [0.0, 1.0]
|
| 83 |
+
terminal: 0.90
|
| 84 |
+
max_steps: 5
|
| 85 |
+
dimensions:
|
| 86 |
+
correctness: 0.25 # Does it work including edge cases?
|
| 87 |
+
attack_resist: 0.25 # Behavioral adversarial β unfakeable
|
| 88 |
+
static_security: 0.15 # bandit + semgrep CWE pattern matching
|
| 89 |
+
consistency: 0.15 # CodeGraph cross-file convention adherence
|
| 90 |
+
performance: 0.10 # timeit + tracemalloc relative to baseline
|
| 91 |
+
documentation: 0.05 # Docstrings + type hints
|
| 92 |
+
code_structure: 0.03 # No print(), no bare except, no hardcoded secrets
|
| 93 |
+
supply_chain: 0.02 # No typosquatted/malicious imports
|
| 94 |
+
|
| 95 |
+
tasks:
|
| 96 |
+
- id: password_validator
|
| 97 |
+
difficulty: easy
|
| 98 |
+
cwe: CWE-916
|
| 99 |
+
attack_type: weak_password_acceptance
|
| 100 |
+
|
| 101 |
+
- id: input_sanitizer
|
| 102 |
+
difficulty: easy
|
| 103 |
+
cwe: CWE-20
|
| 104 |
+
attack_type: xss_payload_passthrough
|
| 105 |
+
|
| 106 |
+
- id: hash_generator
|
| 107 |
+
difficulty: easy
|
| 108 |
+
cwe: CWE-327
|
| 109 |
+
attack_type: shell_invocation_for_hashing
|
| 110 |
+
|
| 111 |
+
- id: sql_query_builder
|
| 112 |
+
difficulty: medium
|
| 113 |
+
cwe: CWE-89
|
| 114 |
+
attack_type: sql_injection_cursor_spy
|
| 115 |
+
|
| 116 |
+
- id: file_path_handler
|
| 117 |
+
difficulty: medium
|
| 118 |
+
cwe: CWE-22
|
| 119 |
+
attack_type: path_traversal_open_spy
|
| 120 |
+
|
| 121 |
+
- id: api_rate_limiter
|
| 122 |
+
difficulty: medium
|
| 123 |
+
cwe: CWE-307
|
| 124 |
+
attack_type: rate_bypass_spoofed_client
|
| 125 |
+
|
| 126 |
+
- id: file_upload_handler
|
| 127 |
+
difficulty: hard
|
| 128 |
+
cwe: CWE-434
|
| 129 |
+
attack_type: malicious_file_extension
|
| 130 |
+
|
| 131 |
+
- id: jwt_validator
|
| 132 |
+
difficulty: hard
|
| 133 |
+
cwe: CWE-347
|
| 134 |
+
attack_type: jwt_algorithm_bypass
|
| 135 |
+
|
| 136 |
+
- id: auth_middleware
|
| 137 |
+
difficulty: hard
|
| 138 |
+
cwe: CWE-287
|
| 139 |
+
attack_type: auth_bypass_timing_shell
|
| 140 |
+
|
| 141 |
+
runtime:
|
| 142 |
+
max_steps_per_episode: 5
|
| 143 |
+
max_inference_time_minutes: 20
|
| 144 |
+
min_vcpu: 2
|
| 145 |
+
min_memory_gb: 8
|
| 146 |
+
port: 7860
|
requirements.txt
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# requirements.txt β SecureCodeEnv V2
|
| 2 |
+
# All versions pinned for reproducibility
|
| 3 |
+
|
| 4 |
+
# ββ Web framework βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 5 |
+
fastapi==0.115.0
|
| 6 |
+
uvicorn[standard]==0.30.6
|
| 7 |
+
pydantic==2.7.0
|
| 8 |
+
python-multipart==0.0.9
|
| 9 |
+
|
| 10 |
+
# ββ Session persistence βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 11 |
+
redis==5.0.4
|
| 12 |
+
|
| 13 |
+
# ββ Security analysis βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 14 |
+
bandit==1.7.9
|
| 15 |
+
semgrep==1.75.0
|
| 16 |
+
pip-audit==2.7.3
|
| 17 |
+
|
| 18 |
+
# ββ Multi-language parsing ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 19 |
+
tree-sitter==0.23.0
|
| 20 |
+
tree-sitter-python==0.23.0
|
| 21 |
+
tree-sitter-javascript==0.23.0
|
| 22 |
+
|
| 23 |
+
# ββ Cryptography / task dependencies βββββββββββββββββββββββββββββββββββββββββ
|
| 24 |
+
PyJWT==2.8.0
|
| 25 |
+
bcrypt==4.1.3
|
| 26 |
+
cryptography==42.0.8
|
| 27 |
+
|
| 28 |
+
# ββ Inference script ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 29 |
+
openai==1.30.0
|
| 30 |
+
requests==2.32.3
|
| 31 |
+
|
| 32 |
+
# ββ OpenEnv framework βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 33 |
+
# openenv # Uncomment if published; scaffold manually otherwise
|
sandbox/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# sandbox/__init__.py
|
sandbox/executor.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
sandbox/executor.py β Safe code execution via subprocess isolation.
|
| 3 |
+
|
| 4 |
+
Agent code is untrusted. Running it in-process risks:
|
| 5 |
+
- Infinite loops blocking the server
|
| 6 |
+
- File system access
|
| 7 |
+
- Network exfiltration
|
| 8 |
+
- Process termination
|
| 9 |
+
|
| 10 |
+
Solution: write code to a temp file, run in a child subprocess with a hard
|
| 11 |
+
timeout. Docker network policy blocks external network. Main process never crashes.
|
| 12 |
+
"""
|
| 13 |
+
import subprocess
|
| 14 |
+
import tempfile
|
| 15 |
+
import os
|
| 16 |
+
import json
|
| 17 |
+
from typing import Any, Dict
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def safe_exec(
|
| 21 |
+
code: str,
|
| 22 |
+
test_input: str,
|
| 23 |
+
timeout: int = 5,
|
| 24 |
+
entry_fn: str = None,
|
| 25 |
+
) -> Dict[str, Any]:
|
| 26 |
+
"""
|
| 27 |
+
Run agent code in an isolated subprocess.
|
| 28 |
+
|
| 29 |
+
Args:
|
| 30 |
+
code: Python source code (may include harness wrapper)
|
| 31 |
+
test_input: Input string passed to the code (for logging only)
|
| 32 |
+
timeout: Hard kill timeout in seconds (default 5)
|
| 33 |
+
entry_fn: If provided, append a call to this function
|
| 34 |
+
|
| 35 |
+
Returns:
|
| 36 |
+
{"ok": True, "output": <parsed JSON or raw stdout>}
|
| 37 |
+
{"ok": False, "error": <stderr or TIMEOUT>}
|
| 38 |
+
"""
|
| 39 |
+
with tempfile.NamedTemporaryFile(
|
| 40 |
+
mode="w", suffix=".py", delete=False, encoding="utf-8"
|
| 41 |
+
) as f:
|
| 42 |
+
f.write(code)
|
| 43 |
+
if entry_fn:
|
| 44 |
+
f.write(f"\nimport json, sys\n")
|
| 45 |
+
f.write(f"result = {entry_fn}({repr(test_input)})\n")
|
| 46 |
+
f.write(f'print(json.dumps({{"result": result}}))\n')
|
| 47 |
+
path = f.name
|
| 48 |
+
|
| 49 |
+
try:
|
| 50 |
+
proc = subprocess.run(
|
| 51 |
+
["python3", path],
|
| 52 |
+
capture_output=True,
|
| 53 |
+
text=True,
|
| 54 |
+
timeout=timeout,
|
| 55 |
+
)
|
| 56 |
+
if proc.returncode == 0 and proc.stdout.strip():
|
| 57 |
+
try:
|
| 58 |
+
output = json.loads(proc.stdout.strip())
|
| 59 |
+
return {"ok": True, "output": output}
|
| 60 |
+
except json.JSONDecodeError:
|
| 61 |
+
return {"ok": True, "output": proc.stdout.strip()}
|
| 62 |
+
if proc.returncode != 0:
|
| 63 |
+
return {"ok": False, "error": (proc.stderr or proc.stdout)[:500]}
|
| 64 |
+
return {"ok": True, "output": {}}
|
| 65 |
+
except subprocess.TimeoutExpired:
|
| 66 |
+
return {"ok": False, "error": "TIMEOUT β code took too long to execute"}
|
| 67 |
+
except Exception as e:
|
| 68 |
+
return {"ok": False, "error": f"executor_error:{type(e).__name__}:{e}"}
|
| 69 |
+
finally:
|
| 70 |
+
try:
|
| 71 |
+
os.unlink(path)
|
| 72 |
+
except OSError:
|
| 73 |
+
pass
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def safe_run_tests(code: str, test_cases: list, timeout: int = 5) -> Dict[str, Any]:
|
| 77 |
+
"""
|
| 78 |
+
Run structured test cases against agent code.
|
| 79 |
+
Each test case: {"input": ..., "expected": ...}
|
| 80 |
+
|
| 81 |
+
Returns:
|
| 82 |
+
{"passed": int, "total": int, "details": [...]}
|
| 83 |
+
"""
|
| 84 |
+
passed = 0
|
| 85 |
+
details = []
|
| 86 |
+
|
| 87 |
+
for i, tc in enumerate(test_cases):
|
| 88 |
+
inp = tc.get("input")
|
| 89 |
+
expected = tc.get("expected")
|
| 90 |
+
|
| 91 |
+
wrapper = code + f"""
|
| 92 |
+
import json, sys
|
| 93 |
+
_inp = {repr(inp)}
|
| 94 |
+
try:
|
| 95 |
+
_result = run_task(_inp)
|
| 96 |
+
_ok = _result == {repr(expected)}
|
| 97 |
+
print(json.dumps({{"result": str(_result)[:200], "ok": _ok, "expected": {repr(expected)}}}))
|
| 98 |
+
except Exception as e:
|
| 99 |
+
print(json.dumps({{"result": None, "ok": False, "error": str(e)[:200], "expected": {repr(expected)}}}))
|
| 100 |
+
"""
|
| 101 |
+
result = safe_exec(wrapper, str(inp), timeout=timeout)
|
| 102 |
+
if result["ok"]:
|
| 103 |
+
out = result["output"]
|
| 104 |
+
if isinstance(out, dict) and out.get("ok"):
|
| 105 |
+
passed += 1
|
| 106 |
+
details.append({"test": i, "status": "pass", "input": str(inp)[:60]})
|
| 107 |
+
else:
|
| 108 |
+
details.append({
|
| 109 |
+
"test": i, "status": "fail",
|
| 110 |
+
"input": str(inp)[:60],
|
| 111 |
+
"got": out.get("result", "?")[:60] if isinstance(out, dict) else str(out)[:60],
|
| 112 |
+
"expected": str(expected)[:60],
|
| 113 |
+
})
|
| 114 |
+
else:
|
| 115 |
+
details.append({
|
| 116 |
+
"test": i, "status": "error",
|
| 117 |
+
"input": str(inp)[:60],
|
| 118 |
+
"error": result.get("error", "")[:100],
|
| 119 |
+
})
|
| 120 |
+
|
| 121 |
+
return {"passed": passed, "total": len(test_cases), "details": details}
|
sandbox/payload_gen.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
sandbox/payload_gen.py β V2 Compound Evasion Payload Generator
|
| 3 |
+
|
| 4 |
+
V1 used simple mutations: whitespace + case change β trivially memorisable.
|
| 5 |
+
V2 chains 1β3 random mutations per payload from 12 evasion techniques:
|
| 6 |
+
URL encode, double encode, HTML entities, Unicode confusables,
|
| 7 |
+
SQL comment bypass, quote doubling, operator swap, nested traversal,
|
| 8 |
+
slash encoding, fullwidth chars, case, suffix injection.
|
| 9 |
+
|
| 10 |
+
Result: exponential evasion space β agent cannot memorise specific payloads.
|
| 11 |
+
Payloads are seeded per episode (episode_seed) AND per step (+ step).
|
| 12 |
+
"""
|
| 13 |
+
import random
|
| 14 |
+
import urllib.parse
|
| 15 |
+
|
| 16 |
+
BASE_PAYLOADS = {
|
| 17 |
+
"sqli": [
|
| 18 |
+
"' OR '1'='1",
|
| 19 |
+
"admin'--",
|
| 20 |
+
"1 UNION SELECT * FROM users",
|
| 21 |
+
"'; DROP TABLE users;--",
|
| 22 |
+
"' OR 1=1--",
|
| 23 |
+
"1' AND SLEEP(5)--",
|
| 24 |
+
"x' OR name LIKE '%",
|
| 25 |
+
"' UNION SELECT null,null,null--",
|
| 26 |
+
"\" OR \"1\"=\"1",
|
| 27 |
+
"') OR ('1'='1",
|
| 28 |
+
"1; SELECT * FROM information_schema.tables--",
|
| 29 |
+
],
|
| 30 |
+
"path_traversal": [
|
| 31 |
+
"../../etc/passwd",
|
| 32 |
+
"../../../root/.ssh/id_rsa",
|
| 33 |
+
"/etc/shadow",
|
| 34 |
+
"....//....//etc/passwd",
|
| 35 |
+
"%2e%2e/%2e%2e/etc/passwd",
|
| 36 |
+
"..%2F..%2Fetc%2Fpasswd",
|
| 37 |
+
"..\\..\\windows\\system32\\cmd.exe",
|
| 38 |
+
"/%2e%2e/%2e%2e/etc/passwd",
|
| 39 |
+
"..%252F..%252Fetc%252Fpasswd",
|
| 40 |
+
],
|
| 41 |
+
"xss": [
|
| 42 |
+
"<script>alert(1)</script>",
|
| 43 |
+
"<img src=x onerror=alert(1)>",
|
| 44 |
+
"javascript:alert(1)",
|
| 45 |
+
"<svg onload=alert(1)>",
|
| 46 |
+
"'><script>alert(document.cookie)</script>",
|
| 47 |
+
"<iframe src=javascript:alert(1)>",
|
| 48 |
+
"<body onload=alert(1)>",
|
| 49 |
+
"\"><script>alert(1)</script>",
|
| 50 |
+
],
|
| 51 |
+
"jwt_bypass": [
|
| 52 |
+
'{"alg":"none"}.payload.',
|
| 53 |
+
'{"exp":1000}.payload.',
|
| 54 |
+
'eyJhbGciOiJub25lIn0.eyJzdWIiOiJhZG1pbiJ9.',
|
| 55 |
+
'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.tampered.fake_sig',
|
| 56 |
+
'eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIn0.',
|
| 57 |
+
'{"alg":"HS256"}.{"sub":"admin","role":"superuser"}.',
|
| 58 |
+
],
|
| 59 |
+
"weak_password": [
|
| 60 |
+
"password",
|
| 61 |
+
"12345678",
|
| 62 |
+
"Password1",
|
| 63 |
+
"abc",
|
| 64 |
+
"",
|
| 65 |
+
"a" * 1000, # DoS attempt
|
| 66 |
+
"password123",
|
| 67 |
+
"qwerty",
|
| 68 |
+
"111111",
|
| 69 |
+
"letmein",
|
| 70 |
+
],
|
| 71 |
+
"weak_hash": [
|
| 72 |
+
"data",
|
| 73 |
+
"",
|
| 74 |
+
"A" * 10000,
|
| 75 |
+
"\x00\x01\x02",
|
| 76 |
+
"test",
|
| 77 |
+
"hello world",
|
| 78 |
+
],
|
| 79 |
+
"rate_bypass": [
|
| 80 |
+
"",
|
| 81 |
+
None,
|
| 82 |
+
"' OR 1=1",
|
| 83 |
+
"client_id\x00",
|
| 84 |
+
" ",
|
| 85 |
+
"A" * 256,
|
| 86 |
+
"../../etc/passwd",
|
| 87 |
+
"null",
|
| 88 |
+
0,
|
| 89 |
+
],
|
| 90 |
+
"malicious_upload": [
|
| 91 |
+
"shell.php",
|
| 92 |
+
"exploit.sh",
|
| 93 |
+
"malware.exe",
|
| 94 |
+
"backdoor.py",
|
| 95 |
+
"../evil.php",
|
| 96 |
+
"normal.jpg.php",
|
| 97 |
+
"file\x00.txt",
|
| 98 |
+
"webshell.PHP",
|
| 99 |
+
"script.PhP",
|
| 100 |
+
".htaccess",
|
| 101 |
+
"cmd.bat",
|
| 102 |
+
],
|
| 103 |
+
"auth_bypass": [
|
| 104 |
+
"",
|
| 105 |
+
"admin:",
|
| 106 |
+
":password",
|
| 107 |
+
"admin'--",
|
| 108 |
+
"x' OR '1'='1",
|
| 109 |
+
"admin\x00:pass",
|
| 110 |
+
" : ",
|
| 111 |
+
None,
|
| 112 |
+
"admin:' OR '1'='1",
|
| 113 |
+
"' OR 1=1--:",
|
| 114 |
+
],
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def get_payloads(attack_type: str, seed: int, count: int = 8) -> list:
|
| 119 |
+
"""
|
| 120 |
+
Return a seeded random selection of base payloads + compound-mutated variants.
|
| 121 |
+
count//2 base + count//2 mutations β total = count payloads.
|
| 122 |
+
"""
|
| 123 |
+
rng = random.Random(seed)
|
| 124 |
+
base = [p for p in BASE_PAYLOADS.get(attack_type, []) if p is not None]
|
| 125 |
+
if not base:
|
| 126 |
+
return []
|
| 127 |
+
|
| 128 |
+
n_base = min(count // 2, len(base))
|
| 129 |
+
selected = rng.sample(base, n_base)
|
| 130 |
+
variants = [_compound_mutate(str(p), rng) for p in selected]
|
| 131 |
+
|
| 132 |
+
# Pad if we need more
|
| 133 |
+
while len(selected) + len(variants) < count and base:
|
| 134 |
+
extra = rng.choice(base)
|
| 135 |
+
variants.append(_compound_mutate(str(extra), rng))
|
| 136 |
+
|
| 137 |
+
# Include None payloads for the rate_bypass / auth_bypass tasks
|
| 138 |
+
if attack_type in ("rate_bypass", "auth_bypass"):
|
| 139 |
+
selected = [p for p in BASE_PAYLOADS[attack_type] if p is None] + selected
|
| 140 |
+
|
| 141 |
+
return (selected + variants)[:count]
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# ββ Evasion mutations βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 145 |
+
|
| 146 |
+
_OPS = [
|
| 147 |
+
lambda p, rng: urllib.parse.quote(p), # URL encode
|
| 148 |
+
lambda p, rng: urllib.parse.quote(urllib.parse.quote(p)), # Double encode
|
| 149 |
+
lambda p, rng: "".join(f"&#{ord(c)};" for c in p[:50]), # HTML entities
|
| 150 |
+
lambda p, rng: p.replace(" ", "/**/"), # SQL comment bypass
|
| 151 |
+
lambda p, rng: p.replace("'", "''"), # Quote doubling
|
| 152 |
+
lambda p, rng: p.replace("OR", "||").replace("AND", "&&"), # Operator swap
|
| 153 |
+
lambda p, rng: p.replace("../", "....//"), # Nested traversal
|
| 154 |
+
lambda p, rng: p.replace("/", "%2f"), # Slash encoding
|
| 155 |
+
lambda p, rng: p.replace("'", "\u02bc"), # Unicode apostrophe
|
| 156 |
+
lambda p, rng: p.replace("<", "\uff1c").replace(">", "\uff1e"), # Fullwidth angle brackets
|
| 157 |
+
lambda p, rng: p.upper(), # Uppercase
|
| 158 |
+
lambda p, rng: p + rng.choice(["", " ", " --", "\x00", "\t"]), # Suffix
|
| 159 |
+
]
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def _compound_mutate(payload: str, rng: random.Random) -> str:
|
| 163 |
+
"""Apply 1β3 randomly chosen mutations in sequence."""
|
| 164 |
+
n = rng.randint(1, 3)
|
| 165 |
+
result = payload
|
| 166 |
+
for op in rng.choices(_OPS, k=n):
|
| 167 |
+
try:
|
| 168 |
+
result = op(result, rng)
|
| 169 |
+
except Exception:
|
| 170 |
+
pass # If a mutation fails, continue with current value
|
| 171 |
+
return result
|
tasks/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# tasks/__init__.py
|
tasks/easy/__init__.py
ADDED
|
File without changes
|
tasks/easy/hash_generator.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""tasks/easy/hash_generator.py β CWE-327: Use of Broken Cryptographic Algorithm"""
|
| 2 |
+
|
| 3 |
+
TASK = {
|
| 4 |
+
"id": "hash_generator",
|
| 5 |
+
"difficulty": "easy",
|
| 6 |
+
"cwe_targets": ["CWE-327", "CWE-328"],
|
| 7 |
+
"problem_statement": """
|
| 8 |
+
Write a function `generate_hash(data: str) -> str` that:
|
| 9 |
+
1. Returns a secure hex-digest hash of the input string.
|
| 10 |
+
2. MUST use SHA-256 or stronger (hashlib.sha256 or hashlib.sha3_256).
|
| 11 |
+
3. MUST NOT use MD5 or SHA1 β these are broken algorithms.
|
| 12 |
+
4. MUST NOT invoke shell commands to compute the hash.
|
| 13 |
+
5. Must handle empty string and very long inputs (10000+ chars).
|
| 14 |
+
6. Returns a lowercase hex string.
|
| 15 |
+
|
| 16 |
+
The function name must be exactly: generate_hash
|
| 17 |
+
|
| 18 |
+
CORRECT: hashlib.sha256(data.encode()).hexdigest()
|
| 19 |
+
WRONG: hashlib.md5(), hashlib.sha1(), subprocess.run(['sha256sum', ...])
|
| 20 |
+
""",
|
| 21 |
+
"starter_code": """
|
| 22 |
+
import hashlib
|
| 23 |
+
|
| 24 |
+
def generate_hash(data: str) -> str:
|
| 25 |
+
# TODO: implement secure hashing
|
| 26 |
+
# Use SHA-256 or stronger - never MD5/SHA1
|
| 27 |
+
# Never use shell commands
|
| 28 |
+
pass
|
| 29 |
+
""",
|
| 30 |
+
"test_cases": [
|
| 31 |
+
{"input": "hello", "expected": "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824", "fn": "generate_hash"},
|
| 32 |
+
{"input": "", "expected": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855", "fn": "generate_hash"},
|
| 33 |
+
{"input": "test", "expected": "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08", "fn": "generate_hash"},
|
| 34 |
+
{"input": "a" * 10000, "expected": None, "fn": "generate_hash"}, # Just check it runs
|
| 35 |
+
],
|
| 36 |
+
"naive_baseline": {"time_ms": 0.5, "memory_kb": 20},
|
| 37 |
+
"perf_input": "benchmark_data_string_for_hashing_test",
|
| 38 |
+
}
|
tasks/easy/input_sanitizer.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""tasks/easy/input_sanitizer.py β CWE-20: Improper Input Validation"""
|
| 2 |
+
|
| 3 |
+
TASK = {
|
| 4 |
+
"id": "input_sanitizer",
|
| 5 |
+
"difficulty": "easy",
|
| 6 |
+
"cwe_targets": ["CWE-20", "CWE-116", "CWE-79"],
|
| 7 |
+
"problem_statement": """
|
| 8 |
+
Write a function `sanitize_input(user_input: str) -> str` that:
|
| 9 |
+
1. Escapes ALL HTML special characters: < > & " '
|
| 10 |
+
2. Strips leading and trailing whitespace.
|
| 11 |
+
3. Returns the sanitised string safe for rendering in an HTML context.
|
| 12 |
+
4. Returns empty string for None input (never raise).
|
| 13 |
+
5. Handles inputs of any length including empty string.
|
| 14 |
+
|
| 15 |
+
The function name must be exactly: sanitize_input
|
| 16 |
+
|
| 17 |
+
CORRECT approach: use html.escape() or manually replace < > & " '
|
| 18 |
+
WRONG approach: allow raw HTML through, or only escape some characters.
|
| 19 |
+
|
| 20 |
+
Example:
|
| 21 |
+
sanitize_input('<script>alert(1)</script>')
|
| 22 |
+
β '<script>alert(1)</script>'
|
| 23 |
+
""",
|
| 24 |
+
"starter_code": """
|
| 25 |
+
import html
|
| 26 |
+
|
| 27 |
+
def sanitize_input(user_input: str) -> str:
|
| 28 |
+
# TODO: implement proper input sanitisation
|
| 29 |
+
# Must escape: < > & " '
|
| 30 |
+
# Must strip whitespace
|
| 31 |
+
pass
|
| 32 |
+
""",
|
| 33 |
+
"test_cases": [
|
| 34 |
+
{"input": "<script>alert(1)</script>", "expected": "<script>alert(1)</script>", "fn": "sanitize_input"},
|
| 35 |
+
{"input": " hello world ", "expected": "hello world", "fn": "sanitize_input"},
|
| 36 |
+
{"input": "", "expected": "", "fn": "sanitize_input"},
|
| 37 |
+
{"input": "safe text", "expected": "safe text", "fn": "sanitize_input"},
|
| 38 |
+
{"input": "<b>bold</b>", "expected": "<b>bold</b>", "fn": "sanitize_input"},
|
| 39 |
+
{"input": 'say "hello"', "expected": "say "hello"", "fn": "sanitize_input"},
|
| 40 |
+
{"input": "it's fine", "expected": "it's fine", "fn": "sanitize_input"},
|
| 41 |
+
{"input": "a & b", "expected": "a & b", "fn": "sanitize_input"},
|
| 42 |
+
],
|
| 43 |
+
"naive_baseline": {"time_ms": 0.1, "memory_kb": 10},
|
| 44 |
+
"perf_input": "<script>alert(1)</script> & more <b>content</b>",
|
| 45 |
+
}
|
tasks/easy/password_validator.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""tasks/easy/password_validator.py β CWE-916: Weak Password Hashing"""
|
| 2 |
+
|
| 3 |
+
TASK = {
|
| 4 |
+
"id": "password_validator",
|
| 5 |
+
"difficulty": "easy",
|
| 6 |
+
"cwe_targets": ["CWE-916", "CWE-521"],
|
| 7 |
+
"problem_statement": """
|
| 8 |
+
Write a function `validate_password(password: str) -> bool` that:
|
| 9 |
+
1. Returns True if the password meets ALL complexity requirements:
|
| 10 |
+
- Minimum 8 characters
|
| 11 |
+
- At least 1 uppercase letter
|
| 12 |
+
- At least 1 digit
|
| 13 |
+
- At least 1 special character (!@#$%^&*()_+-=[]{}|;:,.<>?)
|
| 14 |
+
2. Returns False for any password that does NOT meet ALL requirements.
|
| 15 |
+
3. Returns False for None or empty input (never raise on bad input).
|
| 16 |
+
4. Must use bcrypt for any hashing β NEVER store plain text or use MD5/SHA1.
|
| 17 |
+
5. Must handle extremely long passwords (10000+ chars) without hanging.
|
| 18 |
+
|
| 19 |
+
SECURITY REQUIREMENT: Reject weak passwords even if they have the right structure.
|
| 20 |
+
The function name must be exactly: validate_password
|
| 21 |
+
""",
|
| 22 |
+
"starter_code": """
|
| 23 |
+
import re
|
| 24 |
+
|
| 25 |
+
def validate_password(password: str) -> bool:
|
| 26 |
+
# TODO: implement secure password validation
|
| 27 |
+
# Remember: min 8 chars, uppercase, digit, special char
|
| 28 |
+
# Use bcrypt for hashing - never MD5/SHA1
|
| 29 |
+
pass
|
| 30 |
+
""",
|
| 31 |
+
"test_cases": [
|
| 32 |
+
{"input": "Password1!", "expected": True, "fn": "validate_password"},
|
| 33 |
+
{"input": "weak", "expected": False, "fn": "validate_password"},
|
| 34 |
+
{"input": "NoSpecial1", "expected": False, "fn": "validate_password"},
|
| 35 |
+
{"input": "NoDigit!Aa", "expected": False, "fn": "validate_password"},
|
| 36 |
+
{"input": "", "expected": False, "fn": "validate_password"},
|
| 37 |
+
{"input": "Short1!", "expected": False, "fn": "validate_password"},
|
| 38 |
+
{"input": "Secur3P@ss", "expected": True, "fn": "validate_password"},
|
| 39 |
+
{"input": "A" * 100 + "1!", "expected": True, "fn": "validate_password"},
|
| 40 |
+
],
|
| 41 |
+
"naive_baseline": {"time_ms": 1, "memory_kb": 50},
|
| 42 |
+
"perf_input": "Password1!",
|
| 43 |
+
}
|
tasks/hard/__init__.py
ADDED
|
File without changes
|
tasks/hard/auth_middleware.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""tasks/hard/auth_middleware.py β CWE-287: Improper Authentication"""
|
| 2 |
+
|
| 3 |
+
TASK = {
|
| 4 |
+
"id": "auth_middleware",
|
| 5 |
+
"difficulty": "hard",
|
| 6 |
+
"cwe_targets": ["CWE-287", "CWE-208", "CWE-78"],
|
| 7 |
+
"problem_statement": """
|
| 8 |
+
Write a function `check_auth(credentials: str) -> bool` that:
|
| 9 |
+
1. Accepts credentials in "username:password" format.
|
| 10 |
+
2. Returns True ONLY for the correct credentials: admin:SuperSecret99!
|
| 11 |
+
3. Uses constant-time comparison (hmac.compare_digest) to prevent timing attacks.
|
| 12 |
+
4. MUST NOT use shell commands for comparison.
|
| 13 |
+
5. Returns False for ALL of these:
|
| 14 |
+
- Empty string or None input
|
| 15 |
+
- SQL injection strings: admin'-- or ' OR 1=1
|
| 16 |
+
- Null byte injection: admin\\x00:anything
|
| 17 |
+
- Missing colon separator
|
| 18 |
+
- Correct username but wrong password
|
| 19 |
+
- Whitespace-only input
|
| 20 |
+
6. Must store the password as a bcrypt hash β never compare plain text directly.
|
| 21 |
+
|
| 22 |
+
The function name must be exactly: check_auth
|
| 23 |
+
|
| 24 |
+
CORRECT: hmac.compare_digest(stored_hash, computed_hash)
|
| 25 |
+
WRONG: stored == provided β timing attack
|
| 26 |
+
WRONG: subprocess.run(['verify_auth', credentials]) β shell injection
|
| 27 |
+
""",
|
| 28 |
+
"starter_code": """
|
| 29 |
+
import hmac
|
| 30 |
+
import hashlib
|
| 31 |
+
import bcrypt
|
| 32 |
+
|
| 33 |
+
# Pre-computed bcrypt hash of "SuperSecret99!" - compute this at module load time
|
| 34 |
+
_CORRECT_USERNAME = "admin"
|
| 35 |
+
_CORRECT_PASSWORD_HASH = None # TODO: bcrypt.hashpw(b"SuperSecret99!", bcrypt.gensalt())
|
| 36 |
+
|
| 37 |
+
def check_auth(credentials: str) -> bool:
|
| 38 |
+
# TODO: implement constant-time secure auth check
|
| 39 |
+
# 1. Validate input (handle None, empty, no colon)
|
| 40 |
+
# 2. Split username:password safely
|
| 41 |
+
# 3. Use hmac.compare_digest for constant-time comparison
|
| 42 |
+
# 4. Never use shell commands
|
| 43 |
+
pass
|
| 44 |
+
""",
|
| 45 |
+
"test_cases": [
|
| 46 |
+
{"input": "admin:SuperSecret99!", "expected": True, "fn": "check_auth"},
|
| 47 |
+
{"input": "admin:wrongpassword", "expected": False, "fn": "check_auth"},
|
| 48 |
+
{"input": "", "expected": False, "fn": "check_auth"},
|
| 49 |
+
{"input": "admin:", "expected": False, "fn": "check_auth"},
|
| 50 |
+
{"input": ":SuperSecret99!", "expected": False, "fn": "check_auth"},
|
| 51 |
+
{"input": "admin'--:pass", "expected": False, "fn": "check_auth"},
|
| 52 |
+
{"input": None, "expected": False, "fn": "check_auth"},
|
| 53 |
+
{"input": "user:SuperSecret99!", "expected": False, "fn": "check_auth"},
|
| 54 |
+
],
|
| 55 |
+
"naive_baseline": {"time_ms": 100, "memory_kb": 200},
|
| 56 |
+
"perf_input": "admin:SuperSecret99!",
|
| 57 |
+
}
|
tasks/hard/file_upload_handler.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""tasks/hard/file_upload_handler.py β CWE-434: Unrestricted File Upload"""
|
| 2 |
+
|
| 3 |
+
TASK = {
|
| 4 |
+
"id": "file_upload_handler",
|
| 5 |
+
"difficulty": "hard",
|
| 6 |
+
"cwe_targets": ["CWE-434", "CWE-22"],
|
| 7 |
+
"problem_statement": """
|
| 8 |
+
Write a function `handle_upload(filename: str, content: bytes) -> str` that:
|
| 9 |
+
1. Accepts ONLY these safe extensions: .txt .png .jpg .jpeg .gif .pdf .csv .webp
|
| 10 |
+
2. REJECTS all executable/dangerous extensions:
|
| 11 |
+
.php .sh .exe .py .rb .pl .bat .js .ts .htaccess .phtml .php3 .php4 .php5
|
| 12 |
+
3. Rejects path traversal in filename: ../ or absolute paths.
|
| 13 |
+
4. Rejects filenames with null bytes (\\x00).
|
| 14 |
+
5. Rejects double-extension tricks: normal.jpg.php β REJECT (last ext is .php)
|
| 15 |
+
6. Saves accepted files to /tmp/uploads/ directory (create if needed).
|
| 16 |
+
7. Returns the final saved path as a string.
|
| 17 |
+
8. Raises ValueError with a clear message for ANY rejection reason.
|
| 18 |
+
|
| 19 |
+
The function name must be exactly: handle_upload
|
| 20 |
+
|
| 21 |
+
The harness monitors every open() call β any write to a disallowed path = FAIL.
|
| 22 |
+
|
| 23 |
+
WRONG: only checking the last extension (misses file.PHP, FILE.Ph P)
|
| 24 |
+
CORRECT: normalise extension to lowercase before checking
|
| 25 |
+
""",
|
| 26 |
+
"starter_code": """
|
| 27 |
+
import os
|
| 28 |
+
from pathlib import Path
|
| 29 |
+
|
| 30 |
+
ALLOWED_EXTENSIONS = {'.txt', '.png', '.jpg', '.jpeg', '.gif', '.pdf', '.csv', '.webp'}
|
| 31 |
+
UPLOAD_DIR = Path('/tmp/uploads')
|
| 32 |
+
|
| 33 |
+
def handle_upload(filename: str, content: bytes) -> str:
|
| 34 |
+
# TODO: implement secure file upload handler
|
| 35 |
+
# 1. Validate filename (no traversal, no null bytes)
|
| 36 |
+
# 2. Check extension (normalise to lowercase, reject dangerous)
|
| 37 |
+
# 3. Save to UPLOAD_DIR and return path
|
| 38 |
+
pass
|
| 39 |
+
""",
|
| 40 |
+
"test_cases": [
|
| 41 |
+
{"input": ["report.pdf", b"PDF content"], "expected": "/tmp/uploads/report.pdf", "fn": "handle_upload"},
|
| 42 |
+
{"input": ["image.png", b"PNG content"], "expected": "/tmp/uploads/image.png", "fn": "handle_upload"},
|
| 43 |
+
],
|
| 44 |
+
"naive_baseline": {"time_ms": 1, "memory_kb": 50},
|
| 45 |
+
"perf_input": "document.txt",
|
| 46 |
+
}
|
tasks/hard/jwt_validator.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""tasks/hard/jwt_validator.py β CWE-352: JWT Algorithm Bypass"""
|
| 2 |
+
|
| 3 |
+
TASK = {
|
| 4 |
+
"id": "jwt_validator",
|
| 5 |
+
"difficulty": "hard",
|
| 6 |
+
"cwe_targets": ["CWE-347", "CWE-352", "CWE-613"],
|
| 7 |
+
"problem_statement": """
|
| 8 |
+
Write a function `validate_jwt(token: str) -> dict | None` that:
|
| 9 |
+
1. Validates a JWT token using HS256 algorithm with secret key: "supersecretkey123"
|
| 10 |
+
2. Returns the decoded payload dict if the token is VALID.
|
| 11 |
+
3. Returns None for ALL of these INVALID cases:
|
| 12 |
+
- Expired tokens (exp claim in the past)
|
| 13 |
+
- Wrong algorithm (anything other than HS256) β especially alg:none
|
| 14 |
+
- Tampered signature
|
| 15 |
+
- Malformed token structure
|
| 16 |
+
- None or empty input
|
| 17 |
+
4. CRITICAL: Must explicitly reject alg:none tokens.
|
| 18 |
+
PyJWT prior to 2.4 accepted alg:none. Never trust the header's alg field.
|
| 19 |
+
5. Must use PyJWT >= 2.4 with algorithms=["HS256"] explicitly specified.
|
| 20 |
+
|
| 21 |
+
The function name must be exactly: validate_jwt
|
| 22 |
+
|
| 23 |
+
SECRET_KEY = "supersecretkey123"
|
| 24 |
+
|
| 25 |
+
CORRECT:
|
| 26 |
+
jwt.decode(token, SECRET_KEY, algorithms=["HS256"])
|
| 27 |
+
|
| 28 |
+
WRONG:
|
| 29 |
+
jwt.decode(token, SECRET_KEY) β no algorithms list
|
| 30 |
+
jwt.decode(token, options={"verify_signature": False}) β disables verification
|
| 31 |
+
""",
|
| 32 |
+
"starter_code": """
|
| 33 |
+
import jwt
|
| 34 |
+
|
| 35 |
+
SECRET_KEY = "supersecretkey123"
|
| 36 |
+
|
| 37 |
+
def validate_jwt(token: str) -> dict | None:
|
| 38 |
+
# TODO: implement secure JWT validation
|
| 39 |
+
# Must reject: alg:none, expired, tampered, malformed
|
| 40 |
+
# Must use algorithms=["HS256"] explicitly
|
| 41 |
+
try:
|
| 42 |
+
pass
|
| 43 |
+
except Exception:
|
| 44 |
+
return None
|
| 45 |
+
""",
|
| 46 |
+
"test_cases": [
|
| 47 |
+
{"input": None, "expected": None, "fn": "validate_jwt"},
|
| 48 |
+
{"input": "", "expected": None, "fn": "validate_jwt"},
|
| 49 |
+
{"input": "not.a.jwt", "expected": None, "fn": "validate_jwt"},
|
| 50 |
+
{"input": "eyJhbGciOiJub25lIn0.eyJzdWIiOiJhZG1pbiJ9.", "expected": None, "fn": "validate_jwt"},
|
| 51 |
+
],
|
| 52 |
+
"naive_baseline": {"time_ms": 5, "memory_kb": 100},
|
| 53 |
+
"perf_input": "eyJhbGciOiJub25lIn0.eyJzdWIiOiJhZG1pbiJ9.",
|
| 54 |
+
}
|
tasks/medium/__init__.py
ADDED
|
File without changes
|
tasks/medium/api_rate_limiter.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""tasks/medium/api_rate_limiter.py β CWE-307: Brute Force / Rate Bypass"""
|
| 2 |
+
|
| 3 |
+
TASK = {
|
| 4 |
+
"id": "api_rate_limiter",
|
| 5 |
+
"difficulty": "medium",
|
| 6 |
+
"cwe_targets": ["CWE-307", "CWE-770"],
|
| 7 |
+
"problem_statement": """
|
| 8 |
+
Write a function `check_rate_limit(client_id: str) -> bool` that:
|
| 9 |
+
1. Returns True if the client is ALLOWED (under the rate limit).
|
| 10 |
+
2. Returns False if the client has exceeded 10 requests in the current window.
|
| 11 |
+
3. Uses an in-memory counter (dict) β a fresh dict each time the module loads.
|
| 12 |
+
4. Must resist spoofed/forged client_id values:
|
| 13 |
+
- Empty string "" β treat as a real client ID (rate limit it, don't bypass)
|
| 14 |
+
- None β treat as "anonymous" client (rate limit it)
|
| 15 |
+
- SQL injection strings β sanitise/normalise before using as key
|
| 16 |
+
- Very long strings β truncate to 256 chars before using as key
|
| 17 |
+
5. The first 10 calls for any client_id return True; the 11th returns False.
|
| 18 |
+
|
| 19 |
+
The function name must be exactly: check_rate_limit
|
| 20 |
+
|
| 21 |
+
WRONG: if not client_id: return True β bypasses rate limit for empty/None
|
| 22 |
+
WRONG: using client_id directly as dict key without normalisation
|
| 23 |
+
""",
|
| 24 |
+
"starter_code": """
|
| 25 |
+
from collections import defaultdict
|
| 26 |
+
|
| 27 |
+
_counters: dict = defaultdict(int)
|
| 28 |
+
RATE_LIMIT = 10
|
| 29 |
+
|
| 30 |
+
def check_rate_limit(client_id: str) -> bool:
|
| 31 |
+
# TODO: implement rate limiting
|
| 32 |
+
# Normalize client_id first (handle None, empty, too long)
|
| 33 |
+
# Increment counter and check against RATE_LIMIT
|
| 34 |
+
pass
|
| 35 |
+
""",
|
| 36 |
+
"test_cases": [
|
| 37 |
+
{"input": "client_123", "expected": True, "fn": "check_rate_limit"},
|
| 38 |
+
{"input": "client_abc", "expected": True, "fn": "check_rate_limit"},
|
| 39 |
+
{"input": "", "expected": True, "fn": "check_rate_limit"},
|
| 40 |
+
],
|
| 41 |
+
"naive_baseline": {"time_ms": 0.1, "memory_kb": 10},
|
| 42 |
+
"perf_input": "client_benchmark",
|
| 43 |
+
}
|
tasks/medium/file_path_handler.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""tasks/medium/file_path_handler.py β CWE-22: Path Traversal"""
|
| 2 |
+
|
| 3 |
+
TASK = {
|
| 4 |
+
"id": "file_path_handler",
|
| 5 |
+
"difficulty": "medium",
|
| 6 |
+
"cwe_targets": ["CWE-22", "CWE-23"],
|
| 7 |
+
"problem_statement": """
|
| 8 |
+
Write a function `handle_path(user_path: str) -> str` that:
|
| 9 |
+
1. Resolves the user-provided path relative to a safe base directory: /tmp/sandbox/
|
| 10 |
+
2. Rejects ANY path that would escape the base directory.
|
| 11 |
+
3. Rejects: ../ traversal, absolute paths (/etc/passwd), URL-encoded traversal (%2e%2e)
|
| 12 |
+
4. Returns the resolved absolute path if safe.
|
| 13 |
+
5. Raises ValueError with a clear message if the path is unsafe.
|
| 14 |
+
6. Handles None input by raising ValueError.
|
| 15 |
+
|
| 16 |
+
The function name must be exactly: handle_path
|
| 17 |
+
|
| 18 |
+
CORRECT approach:
|
| 19 |
+
base = Path('/tmp/sandbox').resolve()
|
| 20 |
+
resolved = (base / user_path).resolve()
|
| 21 |
+
if not str(resolved).startswith(str(base)):
|
| 22 |
+
raise ValueError('Path traversal detected')
|
| 23 |
+
return str(resolved)
|
| 24 |
+
|
| 25 |
+
WRONG approach:
|
| 26 |
+
return '/tmp/sandbox/' + user_path # No traversal check!
|
| 27 |
+
""",
|
| 28 |
+
"starter_code": """
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
|
| 31 |
+
SAFE_BASE = Path('/tmp/sandbox')
|
| 32 |
+
|
| 33 |
+
def handle_path(user_path: str) -> str:
|
| 34 |
+
# TODO: implement safe path resolution
|
| 35 |
+
# Must reject ../ traversal, absolute paths, URL-encoded traversal
|
| 36 |
+
# Use Path.resolve() and check the result starts with SAFE_BASE
|
| 37 |
+
pass
|
| 38 |
+
""",
|
| 39 |
+
"test_cases": [
|
| 40 |
+
{"input": "file.txt", "expected": "/tmp/sandbox/file.txt", "fn": "handle_path"},
|
| 41 |
+
{"input": "subdir/file.txt", "expected": "/tmp/sandbox/subdir/file.txt", "fn": "handle_path"},
|
| 42 |
+
],
|
| 43 |
+
"naive_baseline": {"time_ms": 0.5, "memory_kb": 20},
|
| 44 |
+
"perf_input": "documents/report.txt",
|
| 45 |
+
}
|
tasks/medium/sql_query_builder.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""tasks/medium/sql_query_builder.py β CWE-89: SQL Injection"""
|
| 2 |
+
|
| 3 |
+
TASK = {
|
| 4 |
+
"id": "sql_query_builder",
|
| 5 |
+
"difficulty": "medium",
|
| 6 |
+
"cwe_targets": ["CWE-89", "CWE-20"],
|
| 7 |
+
"problem_statement": """
|
| 8 |
+
Write a function `build_query(conn: sqlite3.Connection, user_input: str) -> list` that:
|
| 9 |
+
1. Queries the `users` table for rows where name matches user_input.
|
| 10 |
+
2. Uses PARAMETERISED queries β NEVER string-format user input into SQL.
|
| 11 |
+
3. Returns a list of matching rows (may be empty list).
|
| 12 |
+
4. Handles None and empty string inputs safely (return empty list).
|
| 13 |
+
5. Never raises exceptions on invalid input β always return a list.
|
| 14 |
+
|
| 15 |
+
The function name must be exactly: build_query
|
| 16 |
+
|
| 17 |
+
The users table schema:
|
| 18 |
+
CREATE TABLE users(id INT, name TEXT, role TEXT, password TEXT)
|
| 19 |
+
|
| 20 |
+
CORRECT: cursor.execute("SELECT * FROM users WHERE name = ?", (user_input,))
|
| 21 |
+
WRONG: cursor.execute(f"SELECT * FROM users WHERE name = '{user_input}'")
|
| 22 |
+
WRONG: cursor.execute("SELECT * FROM users WHERE name = '%s'" % user_input)
|
| 23 |
+
|
| 24 |
+
The behavioral harness spies on sqlite3.Cursor.execute at the C-extension level.
|
| 25 |
+
If the payload string appears inside the SQL string itself β you fail.
|
| 26 |
+
""",
|
| 27 |
+
"starter_code": """
|
| 28 |
+
import sqlite3
|
| 29 |
+
|
| 30 |
+
def build_query(conn: sqlite3.Connection, user_input: str) -> list:
|
| 31 |
+
# TODO: implement using parameterised queries
|
| 32 |
+
# NEVER use f-strings or % formatting to build SQL
|
| 33 |
+
cursor = conn.cursor()
|
| 34 |
+
# cursor.execute("SELECT * FROM users WHERE name = ?", (user_input,))
|
| 35 |
+
# return cursor.fetchall()
|
| 36 |
+
pass
|
| 37 |
+
""",
|
| 38 |
+
"test_cases": [
|
| 39 |
+
{"input": ["alice", "SAFE"], "expected": None, "fn": "build_query"}, # Returns list
|
| 40 |
+
{"input": ["nobody", "SAFE"], "expected": None, "fn": "build_query"},
|
| 41 |
+
{"input": ["", "SAFE"], "expected": None, "fn": "build_query"},
|
| 42 |
+
],
|
| 43 |
+
"naive_baseline": {"time_ms": 2, "memory_kb": 100},
|
| 44 |
+
"perf_input": "alice",
|
| 45 |
+
}
|
tasks/task_registry.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
tasks/task_registry.py β Central task registry.
|
| 3 |
+
|
| 4 |
+
All 9 tasks indexed by ID and difficulty. sample_task() picks randomly
|
| 5 |
+
within a difficulty tier to prevent memorisation across episodes.
|
| 6 |
+
"""
|
| 7 |
+
import random
|
| 8 |
+
from typing import Dict, Any
|
| 9 |
+
|
| 10 |
+
from tasks.easy.password_validator import TASK as T1
|
| 11 |
+
from tasks.easy.input_sanitizer import TASK as T2
|
| 12 |
+
from tasks.easy.hash_generator import TASK as T3
|
| 13 |
+
from tasks.medium.sql_query_builder import TASK as T4
|
| 14 |
+
from tasks.medium.file_path_handler import TASK as T5
|
| 15 |
+
from tasks.medium.api_rate_limiter import TASK as T6
|
| 16 |
+
from tasks.hard.file_upload_handler import TASK as T7
|
| 17 |
+
from tasks.hard.jwt_validator import TASK as T8
|
| 18 |
+
from tasks.hard.auth_middleware import TASK as T9
|
| 19 |
+
|
| 20 |
+
ALL_TASKS: Dict[str, Dict[str, Any]] = {
|
| 21 |
+
t["id"]: t for t in [T1, T2, T3, T4, T5, T6, T7, T8, T9]
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
BY_DIFFICULTY = {
|
| 25 |
+
"easy": [T1, T2, T3],
|
| 26 |
+
"medium": [T4, T5, T6],
|
| 27 |
+
"hard": [T7, T8, T9],
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def get_task(task_id: str) -> Dict[str, Any]:
|
| 32 |
+
if task_id not in ALL_TASKS:
|
| 33 |
+
raise ValueError(f"Unknown task_id: {task_id}. Valid: {list(ALL_TASKS.keys())}")
|
| 34 |
+
return ALL_TASKS[task_id]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def sample_task(difficulty: str = "medium") -> Dict[str, Any]:
|
| 38 |
+
"""Randomly pick a task at the given difficulty. Anti-memorisation."""
|
| 39 |
+
tasks = BY_DIFFICULTY.get(difficulty, BY_DIFFICULTY["medium"])
|
| 40 |
+
return random.choice(tasks)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def list_tasks() -> list:
|
| 44 |
+
return [
|
| 45 |
+
{
|
| 46 |
+
"id": t["id"],
|
| 47 |
+
"difficulty": t["difficulty"],
|
| 48 |
+
"cwe_targets": t["cwe_targets"],
|
| 49 |
+
}
|
| 50 |
+
for t in ALL_TASKS.values()
|
| 51 |
+
]
|
tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# tests/__init__.py
|
tests/test_api.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""tests/test_api.py β Integration tests for /reset /step /state endpoints."""
|
| 2 |
+
import sys, os
|
| 3 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
from fastapi.testclient import TestClient
|
| 7 |
+
from app.main import app
|
| 8 |
+
|
| 9 |
+
client = TestClient(app)
|
| 10 |
+
|
| 11 |
+
SIMPLE_SECURE_CODE = """
|
| 12 |
+
import hashlib
|
| 13 |
+
|
| 14 |
+
def generate_hash(data: str) -> str:
|
| 15 |
+
\"\"\"Generate a secure SHA-256 hash of the input.\"\"\"
|
| 16 |
+
if data is None:
|
| 17 |
+
data = ""
|
| 18 |
+
return hashlib.sha256(data.encode()).hexdigest()
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class TestHealth:
|
| 23 |
+
def test_health_returns_200(self):
|
| 24 |
+
r = client.get("/health")
|
| 25 |
+
assert r.status_code == 200
|
| 26 |
+
data = r.json()
|
| 27 |
+
assert data["status"] == "ok"
|
| 28 |
+
assert data["version"] == "2.0.0"
|
| 29 |
+
assert data["tasks"] == 9
|
| 30 |
+
|
| 31 |
+
def test_root_returns_200(self):
|
| 32 |
+
r = client.get("/")
|
| 33 |
+
assert r.status_code == 200
|
| 34 |
+
data = r.json()
|
| 35 |
+
assert "endpoints" in data
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class TestReset:
|
| 39 |
+
def test_reset_easy(self):
|
| 40 |
+
r = client.post("/reset", params={"difficulty": "easy"})
|
| 41 |
+
assert r.status_code == 200
|
| 42 |
+
data = r.json()
|
| 43 |
+
assert "session_id" in data
|
| 44 |
+
assert "task_id" in data
|
| 45 |
+
assert "problem_statement" in data
|
| 46 |
+
assert "cwe_targets" in data
|
| 47 |
+
assert "codegraph" in data
|
| 48 |
+
assert "starter_code" in data
|
| 49 |
+
assert data["difficulty"] == "easy"
|
| 50 |
+
|
| 51 |
+
def test_reset_medium(self):
|
| 52 |
+
r = client.post("/reset", params={"difficulty": "medium"})
|
| 53 |
+
assert r.status_code == 200
|
| 54 |
+
data = r.json()
|
| 55 |
+
assert data["difficulty"] == "medium"
|
| 56 |
+
|
| 57 |
+
def test_reset_hard(self):
|
| 58 |
+
r = client.post("/reset", params={"difficulty": "hard"})
|
| 59 |
+
assert r.status_code == 200
|
| 60 |
+
|
| 61 |
+
def test_reset_invalid_difficulty(self):
|
| 62 |
+
r = client.post("/reset", params={"difficulty": "impossible"})
|
| 63 |
+
assert r.status_code == 400
|
| 64 |
+
|
| 65 |
+
def test_reset_returns_valid_task_id(self):
|
| 66 |
+
from tasks.task_registry import list_tasks
|
| 67 |
+
valid_ids = {t["id"] for t in list_tasks()}
|
| 68 |
+
r = client.post("/reset", params={"difficulty": "easy"})
|
| 69 |
+
data = r.json()
|
| 70 |
+
assert data["task_id"] in valid_ids
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class TestStep:
|
| 74 |
+
def _new_session(self, difficulty="easy"):
|
| 75 |
+
r = client.post("/reset", params={"difficulty": difficulty})
|
| 76 |
+
return r.json()
|
| 77 |
+
|
| 78 |
+
def test_step_returns_reward_in_range(self):
|
| 79 |
+
episode = self._new_session("easy")
|
| 80 |
+
r = client.post("/step", json={
|
| 81 |
+
"session_id": episode["session_id"],
|
| 82 |
+
"task_id": episode["task_id"],
|
| 83 |
+
"filename": "solution.py",
|
| 84 |
+
"code": SIMPLE_SECURE_CODE,
|
| 85 |
+
})
|
| 86 |
+
assert r.status_code == 200
|
| 87 |
+
data = r.json()
|
| 88 |
+
assert 0.0 <= data["total_reward"] <= 1.0
|
| 89 |
+
|
| 90 |
+
def test_step_returns_all_score_keys(self):
|
| 91 |
+
episode = self._new_session("easy")
|
| 92 |
+
r = client.post("/step", json={
|
| 93 |
+
"session_id": episode["session_id"],
|
| 94 |
+
"task_id": episode["task_id"],
|
| 95 |
+
"filename": "solution.py",
|
| 96 |
+
"code": SIMPLE_SECURE_CODE,
|
| 97 |
+
})
|
| 98 |
+
data = r.json()
|
| 99 |
+
expected_keys = {
|
| 100 |
+
"correctness", "attack_resist", "static_security",
|
| 101 |
+
"consistency", "performance", "documentation",
|
| 102 |
+
"code_structure", "supply_chain",
|
| 103 |
+
}
|
| 104 |
+
assert expected_keys.issubset(set(data["scores"].keys()))
|
| 105 |
+
|
| 106 |
+
def test_step_missing_session_returns_404(self):
|
| 107 |
+
r = client.post("/step", json={
|
| 108 |
+
"session_id": "nonexistent-uuid-1234",
|
| 109 |
+
"task_id": "hash_generator",
|
| 110 |
+
"filename": "solution.py",
|
| 111 |
+
"code": SIMPLE_SECURE_CODE,
|
| 112 |
+
})
|
| 113 |
+
assert r.status_code == 404
|
| 114 |
+
|
| 115 |
+
def test_step_empty_code_returns_422(self):
|
| 116 |
+
episode = self._new_session("easy")
|
| 117 |
+
r = client.post("/step", json={
|
| 118 |
+
"session_id": episode["session_id"],
|
| 119 |
+
"task_id": episode["task_id"],
|
| 120 |
+
"filename": "solution.py",
|
| 121 |
+
"code": " ",
|
| 122 |
+
})
|
| 123 |
+
assert r.status_code == 422
|
| 124 |
+
|
| 125 |
+
def test_done_after_max_steps(self):
|
| 126 |
+
episode = self._new_session("easy")
|
| 127 |
+
sid = episode["session_id"]
|
| 128 |
+
task_id = episode["task_id"]
|
| 129 |
+
last_result = None
|
| 130 |
+
for i in range(5):
|
| 131 |
+
r = client.post("/step", json={
|
| 132 |
+
"session_id": sid,
|
| 133 |
+
"task_id": task_id,
|
| 134 |
+
"filename": f"step{i}.py",
|
| 135 |
+
"code": SIMPLE_SECURE_CODE,
|
| 136 |
+
})
|
| 137 |
+
if r.status_code != 200:
|
| 138 |
+
break
|
| 139 |
+
last_result = r.json()
|
| 140 |
+
assert last_result is not None
|
| 141 |
+
assert last_result["done"] is True
|
| 142 |
+
|
| 143 |
+
def test_step_updates_codegraph(self):
|
| 144 |
+
episode = self._new_session("easy")
|
| 145 |
+
r = client.post("/step", json={
|
| 146 |
+
"session_id": episode["session_id"],
|
| 147 |
+
"task_id": episode["task_id"],
|
| 148 |
+
"filename": "solution.py",
|
| 149 |
+
"code": SIMPLE_SECURE_CODE,
|
| 150 |
+
})
|
| 151 |
+
data = r.json()
|
| 152 |
+
assert "codegraph" in data
|
| 153 |
+
assert "conventions" in data["codegraph"]
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
class TestState:
|
| 157 |
+
def test_state_returns_current_episode(self):
|
| 158 |
+
r = client.post("/reset", params={"difficulty": "medium"})
|
| 159 |
+
sid = r.json()["session_id"]
|
| 160 |
+
|
| 161 |
+
r2 = client.get("/state", params={"session_id": sid})
|
| 162 |
+
assert r2.status_code == 200
|
| 163 |
+
data = r2.json()
|
| 164 |
+
assert data["step"] == 0
|
| 165 |
+
assert data["done"] is False
|
| 166 |
+
assert "task_id" in data
|
| 167 |
+
|
| 168 |
+
def test_state_missing_session_returns_404(self):
|
| 169 |
+
r = client.get("/state", params={"session_id": "bad-uuid-xyz"})
|
| 170 |
+
assert r.status_code == 404
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
if __name__ == "__main__":
|
| 174 |
+
pytest.main([__file__, "-v"])
|
tests/test_codegraph.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""tests/test_codegraph.py β Unit tests for CodeGraph V2."""
|
| 2 |
+
import sys, os
|
| 3 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
from codegraph.graph import CodeGraph, _naming_style
|
| 7 |
+
from codegraph.extractor import extract_metadata
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class TestNamingStyle:
|
| 11 |
+
def test_snake_case(self):
|
| 12 |
+
assert _naming_style("get_user") == "snake_case"
|
| 13 |
+
assert _naming_style("handle_path") == "snake_case"
|
| 14 |
+
|
| 15 |
+
def test_camel_case(self):
|
| 16 |
+
assert _naming_style("getUser") == "camelCase"
|
| 17 |
+
assert _naming_style("handlePath") == "camelCase"
|
| 18 |
+
|
| 19 |
+
def test_pascal_case(self):
|
| 20 |
+
assert _naming_style("GetUser") == "PascalCase"
|
| 21 |
+
assert _naming_style("UserManager") == "PascalCase"
|
| 22 |
+
|
| 23 |
+
def test_all_lowercase(self):
|
| 24 |
+
assert _naming_style("foo") == "snake_case"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class TestCodeGraph:
|
| 28 |
+
def test_empty_graph(self):
|
| 29 |
+
g = CodeGraph(episode_seed=1)
|
| 30 |
+
assert g.components == {}
|
| 31 |
+
assert g.conventions == {}
|
| 32 |
+
|
| 33 |
+
def test_update_adds_component(self):
|
| 34 |
+
g = CodeGraph(episode_seed=1)
|
| 35 |
+
meta = extract_metadata(
|
| 36 |
+
"def get_user(uid: int) -> dict:\n \"\"\"Get user.\"\"\"\n return {}",
|
| 37 |
+
"users.py", 0
|
| 38 |
+
)
|
| 39 |
+
g.update("users.py", meta)
|
| 40 |
+
assert "users" in g.components
|
| 41 |
+
|
| 42 |
+
def test_syntax_error_not_added(self):
|
| 43 |
+
g = CodeGraph(episode_seed=1)
|
| 44 |
+
bad_meta = {"status": "syntax_error", "functions": [], "imports": []}
|
| 45 |
+
g.update("bad.py", bad_meta)
|
| 46 |
+
assert len(g.components) == 0
|
| 47 |
+
|
| 48 |
+
def test_conventions_inferred_after_update(self):
|
| 49 |
+
g = CodeGraph(episode_seed=1)
|
| 50 |
+
meta = extract_metadata(
|
| 51 |
+
"def snake_one(x: int) -> str:\n \"\"\"Doc.\"\"\"\n return str(x)\n"
|
| 52 |
+
"def snake_two(y: int) -> str:\n \"\"\"Doc.\"\"\"\n return str(y)",
|
| 53 |
+
"module.py", 0
|
| 54 |
+
)
|
| 55 |
+
g.update("module.py", meta)
|
| 56 |
+
assert g.conventions.get("naming") in ("snake_case", "camelCase", "PascalCase", "mixed", "unknown")
|
| 57 |
+
|
| 58 |
+
def test_mixed_style_detected(self):
|
| 59 |
+
g = CodeGraph(episode_seed=1)
|
| 60 |
+
# Create artificial metadata with exactly 50/50 split
|
| 61 |
+
meta = {
|
| 62 |
+
"status": "ok",
|
| 63 |
+
"functions": [
|
| 64 |
+
{"name": "get_user"}, # snake_case
|
| 65 |
+
{"name": "getUser"}, # camelCase
|
| 66 |
+
{"name": "set_value"}, # snake_case
|
| 67 |
+
{"name": "getValue"}, # camelCase
|
| 68 |
+
],
|
| 69 |
+
"imports": [],
|
| 70 |
+
"conventions": {},
|
| 71 |
+
"language": "py",
|
| 72 |
+
"created_at_step": 0,
|
| 73 |
+
}
|
| 74 |
+
g.update("mixed.py", meta)
|
| 75 |
+
# 50/50 split β below 60% threshold β should be "mixed"
|
| 76 |
+
assert g.conventions.get("naming") == "mixed"
|
| 77 |
+
|
| 78 |
+
def test_slim_dict_under_limit(self):
|
| 79 |
+
g = CodeGraph(episode_seed=1)
|
| 80 |
+
for i in range(10):
|
| 81 |
+
meta = extract_metadata(
|
| 82 |
+
f"def func_{i}(x: int) -> str:\n return str(x)",
|
| 83 |
+
f"module_{i}.py", i
|
| 84 |
+
)
|
| 85 |
+
g.update(f"module_{i}.py", meta)
|
| 86 |
+
slim = g.to_slim_dict(limit=6000)
|
| 87 |
+
assert len(slim) <= 6000
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class TestExtractor:
|
| 91 |
+
def test_extracts_functions(self):
|
| 92 |
+
code = "def hello(x: int) -> str:\n return str(x)"
|
| 93 |
+
meta = extract_metadata(code, "test.py", 0)
|
| 94 |
+
assert meta["status"] == "ok"
|
| 95 |
+
assert any(f["name"] == "hello" for f in meta["functions"])
|
| 96 |
+
|
| 97 |
+
def test_extracts_imports(self):
|
| 98 |
+
code = "import os\nfrom pathlib import Path\ndef foo(): pass"
|
| 99 |
+
meta = extract_metadata(code, "test.py", 0)
|
| 100 |
+
assert meta["status"] == "ok"
|
| 101 |
+
assert len(meta["imports"]) >= 1
|
| 102 |
+
|
| 103 |
+
def test_syntax_error_returns_structured(self):
|
| 104 |
+
code = "def broken(:\n pass"
|
| 105 |
+
meta = extract_metadata(code, "bad.py", 0)
|
| 106 |
+
assert meta["status"] == "syntax_error"
|
| 107 |
+
assert "line" in meta
|
| 108 |
+
assert "feedback" in meta
|
| 109 |
+
|
| 110 |
+
def test_conventions_detected(self):
|
| 111 |
+
code = "try:\n pass\nexcept ValueError:\n pass\ndef f(x: int) -> str:\n return str(x)"
|
| 112 |
+
meta = extract_metadata(code, "test.py", 0)
|
| 113 |
+
assert meta["conventions"]["uses_try_catch"] is True
|
| 114 |
+
assert meta["conventions"]["uses_type_hints"] is True
|
| 115 |
+
|
| 116 |
+
def test_no_print_detected(self):
|
| 117 |
+
code = "def f():\n print('hello')"
|
| 118 |
+
meta = extract_metadata(code, "test.py", 0)
|
| 119 |
+
assert meta["conventions"]["no_print_stmts"] is False
|
| 120 |
+
|
| 121 |
+
def test_language_set_correctly(self):
|
| 122 |
+
meta_py = extract_metadata("def f(): pass", "module.py", 0)
|
| 123 |
+
assert meta_py["language"] == "py"
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
if __name__ == "__main__":
|
| 127 |
+
pytest.main([__file__, "-v"])
|
tests/test_graders.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""tests/test_graders.py β Unit tests for all graders."""
|
| 2 |
+
import sys, os
|
| 3 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
|
| 7 |
+
SECURE_SQL = """
|
| 8 |
+
import sqlite3
|
| 9 |
+
|
| 10 |
+
def build_query(conn: sqlite3.Connection, user_input: str) -> list:
|
| 11 |
+
\"\"\"Query users with parameterised query.\"\"\"
|
| 12 |
+
if not user_input:
|
| 13 |
+
return []
|
| 14 |
+
cursor = conn.cursor()
|
| 15 |
+
cursor.execute("SELECT * FROM users WHERE name = ?", (user_input,))
|
| 16 |
+
return cursor.fetchall()
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
INSECURE_SQL = """
|
| 20 |
+
def build_query(conn, user_input):
|
| 21 |
+
cursor = conn.cursor()
|
| 22 |
+
cursor.execute(f"SELECT * FROM users WHERE name = '{user_input}'")
|
| 23 |
+
return cursor.fetchall()
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
SECURE_PATH = """
|
| 27 |
+
from pathlib import Path
|
| 28 |
+
|
| 29 |
+
SAFE_BASE = Path('/tmp/sandbox').resolve()
|
| 30 |
+
|
| 31 |
+
def handle_path(user_path: str) -> str:
|
| 32 |
+
\"\"\"Safely resolve path within sandbox.\"\"\"
|
| 33 |
+
if not user_path:
|
| 34 |
+
raise ValueError("Empty path")
|
| 35 |
+
resolved = (SAFE_BASE / user_path).resolve()
|
| 36 |
+
if not str(resolved).startswith(str(SAFE_BASE)):
|
| 37 |
+
raise ValueError("Path traversal detected")
|
| 38 |
+
return str(resolved)
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
INSECURE_PATH = """
|
| 42 |
+
def handle_path(user_path):
|
| 43 |
+
return open("/tmp/sandbox/" + user_path).read()
|
| 44 |
+
"""
|
| 45 |
+
|
| 46 |
+
SECURE_HASH = """
|
| 47 |
+
import hashlib
|
| 48 |
+
|
| 49 |
+
def generate_hash(data: str) -> str:
|
| 50 |
+
\"\"\"Generate SHA-256 hash.\"\"\"
|
| 51 |
+
return hashlib.sha256((data or "").encode()).hexdigest()
|
| 52 |
+
"""
|
| 53 |
+
|
| 54 |
+
INSECURE_HASH = """
|
| 55 |
+
import hashlib
|
| 56 |
+
|
| 57 |
+
def generate_hash(data):
|
| 58 |
+
return hashlib.md5(data.encode()).hexdigest()
|
| 59 |
+
"""
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class TestAttackGrader:
|
| 63 |
+
def test_insecure_sql_scores_low(self):
|
| 64 |
+
from graders.attacks import grade_attack_resistance
|
| 65 |
+
r = grade_attack_resistance(INSECURE_SQL, "sql_query_builder", seed=42)
|
| 66 |
+
assert r["score"] <= 0.3, f"Insecure SQL scored too high: {r['score']}"
|
| 67 |
+
|
| 68 |
+
def test_secure_sql_scores_high(self):
|
| 69 |
+
from graders.attacks import grade_attack_resistance
|
| 70 |
+
r = grade_attack_resistance(SECURE_SQL, "sql_query_builder", seed=42)
|
| 71 |
+
assert r["score"] >= 0.6, f"Secure SQL scored too low: {r['score']}"
|
| 72 |
+
|
| 73 |
+
def test_insecure_path_scores_low(self):
|
| 74 |
+
from graders.attacks import grade_attack_resistance
|
| 75 |
+
r = grade_attack_resistance(INSECURE_PATH, "file_path_handler", seed=42)
|
| 76 |
+
assert r["score"] <= 0.4, f"Insecure path scored too high: {r['score']}"
|
| 77 |
+
|
| 78 |
+
def test_secure_path_scores_high(self):
|
| 79 |
+
from graders.attacks import grade_attack_resistance
|
| 80 |
+
r = grade_attack_resistance(SECURE_PATH, "file_path_handler", seed=42)
|
| 81 |
+
assert r["score"] >= 0.5, f"Secure path scored too low: {r['score']}"
|
| 82 |
+
|
| 83 |
+
def test_unknown_task_returns_full_score(self):
|
| 84 |
+
from graders.attacks import grade_attack_resistance
|
| 85 |
+
r = grade_attack_resistance("def foo(): pass", "unknown_task", seed=1)
|
| 86 |
+
assert r["score"] == 1.0
|
| 87 |
+
|
| 88 |
+
def test_score_in_range(self):
|
| 89 |
+
from graders.attacks import grade_attack_resistance
|
| 90 |
+
r = grade_attack_resistance(SECURE_SQL, "sql_query_builder", seed=99)
|
| 91 |
+
assert 0.0 <= r["score"] <= 1.0
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
class TestStaticAnalysis:
|
| 95 |
+
def test_md5_caught(self):
|
| 96 |
+
from graders.static_analysis import grade_static
|
| 97 |
+
r = grade_static(INSECURE_HASH)
|
| 98 |
+
assert r["score"] < 0.8
|
| 99 |
+
|
| 100 |
+
def test_sha256_clean(self):
|
| 101 |
+
from graders.static_analysis import grade_static
|
| 102 |
+
r = grade_static(SECURE_HASH)
|
| 103 |
+
assert r["score"] >= 0.7
|
| 104 |
+
|
| 105 |
+
def test_eval_caught(self):
|
| 106 |
+
from graders.static_analysis import grade_static
|
| 107 |
+
r = grade_static("def f(x):\n return eval(x)")
|
| 108 |
+
assert r["score"] < 0.7
|
| 109 |
+
|
| 110 |
+
def test_score_in_range(self):
|
| 111 |
+
from graders.static_analysis import grade_static
|
| 112 |
+
r = grade_static(SECURE_SQL)
|
| 113 |
+
assert 0.0 <= r["score"] <= 1.0
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
class TestDocumentation:
|
| 117 |
+
def test_documented_function_scores_high(self):
|
| 118 |
+
from graders.documentation import grade_documentation
|
| 119 |
+
code = '''
|
| 120 |
+
def hello(name: str) -> str:
|
| 121 |
+
"""Greet the user by name."""
|
| 122 |
+
return f"Hello, {name}"
|
| 123 |
+
'''
|
| 124 |
+
r = grade_documentation(code)
|
| 125 |
+
assert r["score"] >= 0.8
|
| 126 |
+
|
| 127 |
+
def test_undocumented_scores_low(self):
|
| 128 |
+
from graders.documentation import grade_documentation
|
| 129 |
+
code = "def hello(name):\n return name"
|
| 130 |
+
r = grade_documentation(code)
|
| 131 |
+
assert r["score"] < 0.5
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
class TestSupplyChain:
|
| 135 |
+
def test_clean_imports_score_full(self):
|
| 136 |
+
from graders.supply_chain import grade_supply_chain
|
| 137 |
+
code = "import hashlib\nimport os\nfrom pathlib import Path"
|
| 138 |
+
r = grade_supply_chain(code)
|
| 139 |
+
assert r["score"] == 1.0
|
| 140 |
+
|
| 141 |
+
def test_typosquat_detected(self):
|
| 142 |
+
from graders.supply_chain import grade_supply_chain
|
| 143 |
+
code = "import reqeusts"
|
| 144 |
+
r = grade_supply_chain(code)
|
| 145 |
+
assert r["score"] < 1.0
|
| 146 |
+
assert len(r["flagged"]) > 0
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class TestCodeGraph:
|
| 150 |
+
def test_update_and_conventions(self):
|
| 151 |
+
from codegraph.graph import CodeGraph
|
| 152 |
+
from codegraph.extractor import extract_metadata
|
| 153 |
+
g = CodeGraph(episode_seed=1)
|
| 154 |
+
meta = extract_metadata(
|
| 155 |
+
"def get_user(user_id: int) -> dict:\n \"\"\"Get user.\"\"\"\n return {}",
|
| 156 |
+
"users.py", 0
|
| 157 |
+
)
|
| 158 |
+
assert meta["status"] == "ok"
|
| 159 |
+
g.update("users.py", meta)
|
| 160 |
+
assert "naming" in g.conventions
|
| 161 |
+
|
| 162 |
+
def test_syntax_error_returned(self):
|
| 163 |
+
from codegraph.extractor import extract_metadata
|
| 164 |
+
meta = extract_metadata("def broken(:\n pass", "bad.py", 0)
|
| 165 |
+
assert meta["status"] == "syntax_error"
|
| 166 |
+
assert "line" in meta
|
| 167 |
+
|
| 168 |
+
def test_no_update_on_syntax_error(self):
|
| 169 |
+
from codegraph.graph import CodeGraph
|
| 170 |
+
from codegraph.extractor import extract_metadata
|
| 171 |
+
g = CodeGraph(episode_seed=1)
|
| 172 |
+
meta = extract_metadata("def broken(:\n pass", "bad.py", 0)
|
| 173 |
+
g.update("bad.py", meta)
|
| 174 |
+
assert len(g.components) == 0
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
class TestTaskRegistry:
|
| 178 |
+
def test_all_9_tasks_registered(self):
|
| 179 |
+
from tasks.task_registry import list_tasks
|
| 180 |
+
tasks = list_tasks()
|
| 181 |
+
assert len(tasks) == 9
|
| 182 |
+
|
| 183 |
+
def test_sample_task_by_difficulty(self):
|
| 184 |
+
from tasks.task_registry import sample_task
|
| 185 |
+
for diff in ["easy", "medium", "hard"]:
|
| 186 |
+
t = sample_task(diff)
|
| 187 |
+
assert t["difficulty"] == diff
|
| 188 |
+
assert "id" in t
|
| 189 |
+
assert "problem_statement" in t
|
| 190 |
+
assert "test_cases" in t
|
| 191 |
+
assert "cwe_targets" in t
|
| 192 |
+
|
| 193 |
+
def test_get_task_by_id(self):
|
| 194 |
+
from tasks.task_registry import get_task
|
| 195 |
+
t = get_task("sql_query_builder")
|
| 196 |
+
assert t["id"] == "sql_query_builder"
|
| 197 |
+
assert "CWE-89" in t["cwe_targets"]
|
| 198 |
+
|
| 199 |
+
def test_invalid_task_raises(self):
|
| 200 |
+
from tasks.task_registry import get_task
|
| 201 |
+
with pytest.raises(ValueError):
|
| 202 |
+
get_task("nonexistent_task")
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
if __name__ == "__main__":
|
| 206 |
+
pytest.main([__file__, "-v"])
|
validate.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
validate.py β Pre-submission validation script.
|
| 3 |
+
|
| 4 |
+
Run this before EVERY submission: python validate.py
|
| 5 |
+
|
| 6 |
+
Strategy: feed KNOWN-INSECURE code to graders β verify they score LOW.
|
| 7 |
+
If insecure code scores HIGH β grader is broken β DO NOT SUBMIT.
|
| 8 |
+
|
| 9 |
+
All 4+ checks must pass. Any failure = fix before submitting.
|
| 10 |
+
"""
|
| 11 |
+
import sys
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
sys.path.insert(0, os.path.dirname(__file__))
|
| 15 |
+
|
| 16 |
+
# ββ Known-bad code samples ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 17 |
+
|
| 18 |
+
INSECURE_SQL = """
|
| 19 |
+
def build_query(conn, user_input):
|
| 20 |
+
cursor = conn.cursor()
|
| 21 |
+
cursor.execute(f"SELECT * FROM users WHERE name = '{user_input}'")
|
| 22 |
+
return cursor.fetchall()
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
SECURE_SQL = """
|
| 26 |
+
import sqlite3
|
| 27 |
+
|
| 28 |
+
def build_query(conn: sqlite3.Connection, user_input: str) -> list:
|
| 29 |
+
\"\"\"Query users table using parameterised query.\"\"\"
|
| 30 |
+
if user_input is None:
|
| 31 |
+
return []
|
| 32 |
+
cursor = conn.cursor()
|
| 33 |
+
cursor.execute("SELECT * FROM users WHERE name = ?", (user_input,))
|
| 34 |
+
return cursor.fetchall()
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
INSECURE_PATH = """
|
| 38 |
+
import os
|
| 39 |
+
|
| 40 |
+
def handle_path(user_path):
|
| 41 |
+
return open("/tmp/sandbox/" + user_path).read()
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
SECURE_PATH = """
|
| 45 |
+
from pathlib import Path
|
| 46 |
+
|
| 47 |
+
SAFE_BASE = Path('/tmp/sandbox').resolve()
|
| 48 |
+
|
| 49 |
+
def handle_path(user_path: str) -> str:
|
| 50 |
+
\"\"\"Safely resolve path within sandbox directory.\"\"\"
|
| 51 |
+
if not user_path:
|
| 52 |
+
raise ValueError("Empty path")
|
| 53 |
+
resolved = (SAFE_BASE / user_path).resolve()
|
| 54 |
+
if not str(resolved).startswith(str(SAFE_BASE)):
|
| 55 |
+
raise ValueError(f"Path traversal detected: {user_path}")
|
| 56 |
+
return str(resolved)
|
| 57 |
+
"""
|
| 58 |
+
|
| 59 |
+
INSECURE_HASH = """
|
| 60 |
+
import hashlib
|
| 61 |
+
|
| 62 |
+
def generate_hash(data):
|
| 63 |
+
return hashlib.md5(data.encode()).hexdigest()
|
| 64 |
+
"""
|
| 65 |
+
|
| 66 |
+
SECURE_HASH = """
|
| 67 |
+
import hashlib
|
| 68 |
+
|
| 69 |
+
def generate_hash(data: str) -> str:
|
| 70 |
+
\"\"\"Generate SHA-256 hash of input data.\"\"\"
|
| 71 |
+
if data is None:
|
| 72 |
+
data = ""
|
| 73 |
+
return hashlib.sha256(data.encode()).hexdigest()
|
| 74 |
+
"""
|
| 75 |
+
|
| 76 |
+
INSECURE_JWT = """
|
| 77 |
+
def validate_jwt(token):
|
| 78 |
+
import jwt
|
| 79 |
+
try:
|
| 80 |
+
return jwt.decode(token, options={"verify_signature": False})
|
| 81 |
+
except:
|
| 82 |
+
return None
|
| 83 |
+
"""
|
| 84 |
+
|
| 85 |
+
SECURE_JWT = """
|
| 86 |
+
import jwt
|
| 87 |
+
|
| 88 |
+
SECRET_KEY = "supersecretkey123"
|
| 89 |
+
|
| 90 |
+
def validate_jwt(token: str) -> dict | None:
|
| 91 |
+
\"\"\"Validate JWT token with explicit algorithm whitelist.\"\"\"
|
| 92 |
+
if not token:
|
| 93 |
+
return None
|
| 94 |
+
try:
|
| 95 |
+
return jwt.decode(token, SECRET_KEY, algorithms=["HS256"])
|
| 96 |
+
except Exception:
|
| 97 |
+
return None
|
| 98 |
+
"""
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ββ Validation runner βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 102 |
+
|
| 103 |
+
def run_validation():
|
| 104 |
+
from graders.attacks import grade_attack_resistance
|
| 105 |
+
from graders.static_analysis import grade_static
|
| 106 |
+
|
| 107 |
+
failures = []
|
| 108 |
+
passes = []
|
| 109 |
+
|
| 110 |
+
print("=" * 60)
|
| 111 |
+
print("SecureCodeEnv V2 β Pre-Submission Validation")
|
| 112 |
+
print("=" * 60)
|
| 113 |
+
|
| 114 |
+
# ββ Test 1: Insecure SQL must score LOW on attack resistance βββββββββββββ
|
| 115 |
+
print("\n[1] SQL injection grader...")
|
| 116 |
+
r = grade_attack_resistance(INSECURE_SQL, "sql_query_builder", seed=42)
|
| 117 |
+
if r["score"] > 0.3:
|
| 118 |
+
failures.append(f"FAIL sql_query_builder: insecure code scored {r['score']:.2f} (expected <0.30)")
|
| 119 |
+
print(f" β FAIL β insecure SQL scored {r['score']:.2f} (should be <0.30)")
|
| 120 |
+
else:
|
| 121 |
+
passes.append("sql_query_builder insecure")
|
| 122 |
+
print(f" β
PASS β insecure SQL scored {r['score']:.2f}")
|
| 123 |
+
|
| 124 |
+
# ββ Test 2: Secure SQL must score HIGH ββββββββββββββββββββββββββββββββββββ
|
| 125 |
+
r = grade_attack_resistance(SECURE_SQL, "sql_query_builder", seed=42)
|
| 126 |
+
if r["score"] < 0.7:
|
| 127 |
+
failures.append(f"FAIL sql_query_builder: SECURE code scored {r['score']:.2f} (expected >0.70)")
|
| 128 |
+
print(f" β FAIL β secure SQL scored {r['score']:.2f} (should be >0.70)")
|
| 129 |
+
else:
|
| 130 |
+
passes.append("sql_query_builder secure")
|
| 131 |
+
print(f" β
PASS β secure SQL scored {r['score']:.2f}")
|
| 132 |
+
|
| 133 |
+
# ββ Test 3: Insecure path traversal must score LOW ββββββββββββββββββββββββ
|
| 134 |
+
print("\n[2] Path traversal grader...")
|
| 135 |
+
r = grade_attack_resistance(INSECURE_PATH, "file_path_handler", seed=42)
|
| 136 |
+
if r["score"] > 0.3:
|
| 137 |
+
failures.append(f"FAIL file_path_handler: insecure code scored {r['score']:.2f} (expected <0.30)")
|
| 138 |
+
print(f" β FAIL β insecure path scored {r['score']:.2f} (should be <0.30)")
|
| 139 |
+
else:
|
| 140 |
+
passes.append("file_path_handler insecure")
|
| 141 |
+
print(f" β
PASS β insecure path scored {r['score']:.2f}")
|
| 142 |
+
|
| 143 |
+
# ββ Test 4: Secure path must score HIGH βββββββββββββββββββββββββββββββββββ
|
| 144 |
+
r = grade_attack_resistance(SECURE_PATH, "file_path_handler", seed=42)
|
| 145 |
+
if r["score"] < 0.5:
|
| 146 |
+
failures.append(f"FAIL file_path_handler: SECURE code scored {r['score']:.2f} (expected >0.50)")
|
| 147 |
+
print(f" β FAIL β secure path scored {r['score']:.2f} (should be >0.50)")
|
| 148 |
+
else:
|
| 149 |
+
passes.append("file_path_handler secure")
|
| 150 |
+
print(f" β
PASS β secure path scored {r['score']:.2f}")
|
| 151 |
+
|
| 152 |
+
# ββ Test 5: MD5 usage must be caught by static analysis ββββββββββββββββββ
|
| 153 |
+
print("\n[3] Static analysis (bandit + heuristics)...")
|
| 154 |
+
r = grade_static(INSECURE_HASH)
|
| 155 |
+
if r["score"] > 0.7:
|
| 156 |
+
failures.append(f"FAIL static: MD5 usage not caught (scored {r['score']:.2f}, expected <0.70)")
|
| 157 |
+
print(f" β FAIL β MD5 not caught, score={r['score']:.2f}")
|
| 158 |
+
else:
|
| 159 |
+
passes.append("static_analysis MD5")
|
| 160 |
+
print(f" β
PASS β MD5 caught, score={r['score']:.2f}")
|
| 161 |
+
|
| 162 |
+
# ββ Test 6: JWT bypass must be caught ββββββββββββββββββββββββββββββββββββ
|
| 163 |
+
print("\n[4] JWT bypass grader...")
|
| 164 |
+
r = grade_attack_resistance(INSECURE_JWT, "jwt_validator", seed=99)
|
| 165 |
+
if r["score"] > 0.4:
|
| 166 |
+
failures.append(f"FAIL jwt_validator: insecure JWT scored {r['score']:.2f} (expected <0.40)")
|
| 167 |
+
print(f" β FAIL β insecure JWT scored {r['score']:.2f} (should be <0.40)")
|
| 168 |
+
else:
|
| 169 |
+
passes.append("jwt_validator insecure")
|
| 170 |
+
print(f" β
PASS β insecure JWT scored {r['score']:.2f}")
|
| 171 |
+
|
| 172 |
+
r = grade_attack_resistance(SECURE_JWT, "jwt_validator", seed=99)
|
| 173 |
+
if r["score"] < 0.5:
|
| 174 |
+
failures.append(f"FAIL jwt_validator: SECURE code scored {r['score']:.2f} (expected >0.50)")
|
| 175 |
+
print(f" β FAIL β secure JWT scored {r['score']:.2f} (should be >0.50)")
|
| 176 |
+
else:
|
| 177 |
+
passes.append("jwt_validator secure")
|
| 178 |
+
print(f" β
PASS β secure JWT scored {r['score']:.2f}")
|
| 179 |
+
|
| 180 |
+
# ββ Test 7: API endpoints check ββββββββββββββββββββββββββββββββββββββββββ
|
| 181 |
+
print("\n[5] Task registry...")
|
| 182 |
+
try:
|
| 183 |
+
from tasks.task_registry import list_tasks, sample_task
|
| 184 |
+
tasks = list_tasks()
|
| 185 |
+
assert len(tasks) == 9, f"Expected 9 tasks, got {len(tasks)}"
|
| 186 |
+
for diff in ["easy", "medium", "hard"]:
|
| 187 |
+
t = sample_task(diff)
|
| 188 |
+
assert "id" in t and "problem_statement" in t and "test_cases" in t
|
| 189 |
+
passes.append("task_registry")
|
| 190 |
+
print(f" β
PASS β {len(tasks)} tasks registered correctly")
|
| 191 |
+
except Exception as e:
|
| 192 |
+
failures.append(f"FAIL task_registry: {e}")
|
| 193 |
+
print(f" β FAIL β {e}")
|
| 194 |
+
|
| 195 |
+
# ββ Test 8: CodeGraph βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 196 |
+
print("\n[6] CodeGraph...")
|
| 197 |
+
try:
|
| 198 |
+
from codegraph.graph import CodeGraph
|
| 199 |
+
from codegraph.extractor import extract_metadata
|
| 200 |
+
g = CodeGraph(episode_seed=42)
|
| 201 |
+
meta = extract_metadata("def hello(x: int) -> str:\n return str(x)", "test.py", 0)
|
| 202 |
+
assert meta["status"] == "ok"
|
| 203 |
+
assert len(meta["functions"]) == 1
|
| 204 |
+
g.update("test.py", meta)
|
| 205 |
+
assert "naming" in g.conventions
|
| 206 |
+
passes.append("codegraph")
|
| 207 |
+
print(f" β
PASS β CodeGraph working, naming={g.conventions['naming']}")
|
| 208 |
+
except Exception as e:
|
| 209 |
+
failures.append(f"FAIL codegraph: {e}")
|
| 210 |
+
print(f" β FAIL β {e}")
|
| 211 |
+
|
| 212 |
+
# ββ Summary βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 213 |
+
print("\n" + "=" * 60)
|
| 214 |
+
if failures:
|
| 215 |
+
print(f"β VALIDATION FAILED β {len(failures)} check(s) failed:")
|
| 216 |
+
for f in failures:
|
| 217 |
+
print(f" β {f}")
|
| 218 |
+
print("\nDo NOT submit until all checks pass.")
|
| 219 |
+
sys.exit(1)
|
| 220 |
+
else:
|
| 221 |
+
print(f"β
ALL {len(passes)} CHECKS PASSED β Safe to submit to HuggingFace!")
|
| 222 |
+
print("=" * 60)
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
if __name__ == "__main__":
|
| 226 |
+
run_validation()
|