ajaxwin commited on
Commit
88875f7
Β·
1 Parent(s): 1b91307

Structure Changed, files reviewed

Browse files
Dockerfile CHANGED
@@ -1,35 +1,80 @@
1
- # ---------------------------------------------------------------------------
2
- # Smart Contract Audit RL Environment
3
- # Hugging Face Space β€” Docker runtime
4
- # ---------------------------------------------------------------------------
 
5
 
6
- FROM python:3.11-slim
 
 
 
 
 
 
 
7
 
8
  WORKDIR /app
9
 
10
- # System deps
11
- RUN apt-get update && apt-get install -y --no-install-recommends \
12
- curl \
13
- && rm -rf /var/lib/apt/lists/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- # Install Python deps first (layer cache)
16
- COPY requirements.txt .
17
- RUN pip install --no-cache-dir -r requirements.txt
18
 
19
- # Copy project
20
- COPY . .
21
 
22
- # Create empty __init__ files if missing (safety)
23
- RUN touch env/__init__.py tasks/__init__.py tasks/task1/__init__.py \
24
- tasks/task2/__init__.py tasks/task3/__init__.py \
25
- data/__init__.py utils/__init__.py
26
 
27
- # HF Spaces requires port 7860
28
- EXPOSE 7860
29
 
30
- # Healthcheck
31
- HEALTHCHECK --interval=30s --timeout=10s --start-period=15s --retries=3 \
32
- CMD curl -f http://localhost:7860/health || exit 1
33
 
34
- # Launch FastAPI
35
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
 
7
+ # Multi-stage build using openenv-base
8
+ # This Dockerfile is flexible and works for both:
9
+ # - In-repo environments (with local OpenEnv sources)
10
+ # - Standalone environments (with openenv from PyPI/Git)
11
+ # The build script (openenv build) handles context detection and sets appropriate build args.
12
+
13
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
+ FROM ${BASE_IMAGE} AS builder
15
 
16
  WORKDIR /app
17
 
18
+ # Ensure git is available (required for installing dependencies from VCS)
19
+ RUN apt-get update && \
20
+ apt-get install -y --no-install-recommends git && \
21
+ rm -rf /var/lib/apt/lists/*
22
+
23
+ # Build argument to control whether we're building standalone or in-repo
24
+ ARG BUILD_MODE=in-repo
25
+ ARG ENV_NAME=smartcontractenv
26
+
27
+ # Copy environment code (always at root of build context)
28
+ COPY . /app/env
29
+
30
+ # For in-repo builds, openenv is already vendored in the build context
31
+ # For standalone builds, openenv will be installed via pyproject.toml
32
+ WORKDIR /app/env
33
+
34
+ # Ensure uv is available (for local builds where base image lacks it)
35
+ RUN if ! command -v uv >/dev/null 2>&1; then \
36
+ curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
38
+ mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
+ fi
40
+
41
+ # Install dependencies using uv sync
42
+ # If uv.lock exists, use it; otherwise resolve on the fly
43
+ RUN --mount=type=cache,target=/root/.cache/uv \
44
+ if [ -f uv.lock ]; then \
45
+ uv sync --frozen --no-install-project --no-editable; \
46
+ else \
47
+ uv sync --no-install-project --no-editable; \
48
+ fi
49
+
50
+ RUN --mount=type=cache,target=/root/.cache/uv \
51
+ if [ -f uv.lock ]; then \
52
+ uv sync --frozen --no-editable; \
53
+ else \
54
+ uv sync --no-editable; \
55
+ fi
56
+
57
+ # Final runtime stage
58
+ FROM ${BASE_IMAGE}
59
+
60
+ WORKDIR /app
61
 
62
+ # Copy the virtual environment from builder
63
+ COPY --from=builder /app/env/.venv /app/.venv
 
64
 
65
+ # Copy the environment code
66
+ COPY --from=builder /app/env /app/env
67
 
68
+ # Set PATH to use the virtual environment
69
+ ENV PATH="/app/.venv/bin:$PATH"
 
 
70
 
71
+ # Set PYTHONPATH so imports work correctly
72
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
73
 
74
+ # Health check
75
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
76
+ CMD curl -f http://localhost:8000/health || exit 1
77
 
78
+ # Run the FastAPI server
79
+ # The module path is constructed to work with the /app/env structure
80
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 7860"]
agents/task1.py CHANGED
@@ -3,7 +3,7 @@
3
  import random as _random
4
  from typing import Any, Dict, List
5
 
6
- from tasks.task1 import Task1Environment
7
  from env.schemas import Action, ActionType
8
  from data.data_loader import load_contracts, get_function_by_name
9
 
 
3
  import random as _random
4
  from typing import Any, Dict, List
5
 
6
+ from server import Task1Environment
7
  from env.schemas import Action, ActionType
8
  from data.data_loader import load_contracts, get_function_by_name
9
 
agents/task2.py CHANGED
@@ -3,7 +3,7 @@
3
  import random as _random
4
  from typing import Any, Dict, List
5
 
6
- from tasks.task2 import Task2Environment
7
  from env.schemas import Action, ActionType
8
  from data.data_loader import load_contracts, get_function_by_name
9
 
 
3
  import random as _random
4
  from typing import Any, Dict, List
5
 
6
+ from server import Task2Environment
7
  from env.schemas import Action, ActionType
8
  from data.data_loader import load_contracts, get_function_by_name
9
 
agents/task3.py CHANGED
@@ -4,7 +4,7 @@ import json
4
  import random as _random
5
  from typing import Any, Dict, List
6
 
7
- from tasks.task3 import Task3Environment
8
  from env.schemas import Action, ActionType
9
  from data.data_loader import load_contracts, get_function_by_name
10
 
 
4
  import random as _random
5
  from typing import Any, Dict, List
6
 
7
+ from server import Task3Environment
8
  from env.schemas import Action, ActionType
9
  from data.data_loader import load_contracts, get_function_by_name
10
 
eval.py CHANGED
@@ -21,9 +21,7 @@ import json
21
  import random as _random
22
  from typing import Any, Dict, List
23
 
24
- from tasks.task1 import Task1Environment
25
- from tasks.task2 import Task2Environment
26
- from tasks.task3 import Task3Environment
27
  from agents.task1 import oracle_t1, partial_t1, random_t1, floor_t1
28
  from agents.task2 import oracle_t2, partial_t2, random_t2, floor_t2
29
  from agents.task3 import oracle_t3, subfunction_t3, random_t3, floor_t3
 
21
  import random as _random
22
  from typing import Any, Dict, List
23
 
24
+ from server import Task1Environment, Task2Environment, Task3Environment
 
 
25
  from agents.task1 import oracle_t1, partial_t1, random_t1, floor_t1
26
  from agents.task2 import oracle_t2, partial_t2, random_t2, floor_t2
27
  from agents.task3 import oracle_t3, subfunction_t3, random_t3, floor_t3
inference.py CHANGED
@@ -1,7 +1,7 @@
1
  """
2
  inference.py
3
  ------------
4
- Baseline inference script β€” Smart Contract Audit RL Environment.
5
 
6
  Implements agents for all three tasks using the OpenAI-compatible client.
7
  Emits mandatory structured stdout in the OpenEnv format.
@@ -32,22 +32,23 @@ from typing import Any, Dict, List, Optional
32
 
33
  from openai import OpenAI
34
 
35
- from tasks.task1 import Task1Environment
36
- from tasks.task2 import Task2Environment
37
- from tasks.task3 import Task3Environment
38
  from env.schemas import Action, ActionType
39
  from utils import T1_SYSTEM, T2_SYSTEM, T3_SYSTEM
 
40
 
41
  # ─────────────────────────────────────────────────────────────────────────────
42
  # Configuration
43
  # ─────────────────────────────────────────────────────────────────────────────
44
 
45
- API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
46
- MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
47
- HF_TOKEN = os.getenv("HF_TOKEN", "")
 
48
 
49
  if not HF_TOKEN:
50
  print("[WARN] HF_TOKEN not set β€” API calls may fail.", file=sys.stderr)
 
51
 
52
  # Benchmark / environment identifier (constant for this env)
53
  ENV_BENCHMARK = "smart-contract-audit"
@@ -126,9 +127,11 @@ def _run_t1_episode(env: Task1Environment, seed: int, ep_num: int) -> Dict[str,
126
  r = env.reset(seed=seed)
127
  obs = r.observation.model_dump()
128
 
129
- log_start(task="task1_vuln_detection", env=ENV_BENCHMARK, model=MODEL_NAME)
130
 
131
- messages = [{"role": "system", "content": T1_SYSTEM}]
 
 
132
  step_rewards: List[float] = []
133
  grader_score = 0.0
134
  steps_taken = 0
@@ -139,7 +142,7 @@ def _run_t1_episode(env: Task1Environment, seed: int, ep_num: int) -> Dict[str,
139
  messages.append({"role": "user", "content": _t1_user_msg(obs)})
140
  try:
141
  resp = client.chat.completions.create(
142
- model=MODEL_NAME, messages=messages,
143
  max_tokens=200, temperature=0.0,
144
  )
145
  raw = resp.choices[0].message.content.strip() # type: ignore
@@ -209,9 +212,11 @@ def _run_t2_episode(env: Task2Environment, seed: int, ep_num: int) -> Dict[str,
209
  obs = r.observation.model_dump()
210
  fn = obs["extra"].get("target_function", "?")
211
 
212
- log_start(task="task2_property_discovery", env=ENV_BENCHMARK, model=MODEL_NAME)
213
 
214
- messages = [{"role": "system", "content": T2_SYSTEM}]
 
 
215
  step_rewards: List[float] = []
216
  grader_score = 0.0
217
  steps_taken = 0
@@ -222,7 +227,7 @@ def _run_t2_episode(env: Task2Environment, seed: int, ep_num: int) -> Dict[str,
222
  messages.append({"role": "user", "content": _t2_user_msg(obs)})
223
  try:
224
  resp = client.chat.completions.create(
225
- model=MODEL_NAME, messages=messages,
226
  max_tokens=400, temperature=0.0,
227
  )
228
  raw = resp.choices[0].message.content.strip() # type: ignore
@@ -290,9 +295,11 @@ def _run_t3_episode(env: Task3Environment, seed: int, ep_num: int) -> Dict[str,
290
  r = env.reset(seed=seed)
291
  obs = r.observation.model_dump()
292
 
293
- log_start(task="task3_rule_checker", env=ENV_BENCHMARK, model=MODEL_NAME)
294
 
295
- messages = [{"role": "system", "content": T3_SYSTEM}]
 
 
296
  step_rewards: List[float] = []
297
  grader_score = 0.0
298
  steps_taken = 0
@@ -303,7 +310,7 @@ def _run_t3_episode(env: Task3Environment, seed: int, ep_num: int) -> Dict[str,
303
  messages.append({"role": "user", "content": _t3_user_msg(obs)})
304
  try:
305
  resp = client.chat.completions.create(
306
- model=MODEL_NAME, messages=messages,
307
  max_tokens=200, temperature=0.0,
308
  )
309
  raw = resp.choices[0].message.content.strip() # type: ignore
 
1
  """
2
  inference.py
3
  ------------
4
+ Inference script β€” Smart Contract Audit RL Environment.
5
 
6
  Implements agents for all three tasks using the OpenAI-compatible client.
7
  Emits mandatory structured stdout in the OpenEnv format.
 
32
 
33
  from openai import OpenAI
34
 
35
+ from server import Task1Environment, Task2Environment, Task3Environment
 
 
36
  from env.schemas import Action, ActionType
37
  from utils import T1_SYSTEM, T2_SYSTEM, T3_SYSTEM
38
+ from dotenv import dotenv_values
39
 
40
  # ─────────────────────────────────────────────────────────────────────────────
41
  # Configuration
42
  # ─────────────────────────────────────────────────────────────────────────────
43
 
44
+ config = dotenv_values(".env")
45
+ API_BASE_URL = config.get("API_BASE_URL", "https://api.openai.com/v1")
46
+ MODEL_NAME = config.get("MODEL_NAME", "gpt-4o")
47
+ HF_TOKEN = config.get("HF_TOKEN", "")
48
 
49
  if not HF_TOKEN:
50
  print("[WARN] HF_TOKEN not set β€” API calls may fail.", file=sys.stderr)
51
+ exit(1)
52
 
53
  # Benchmark / environment identifier (constant for this env)
54
  ENV_BENCHMARK = "smart-contract-audit"
 
127
  r = env.reset(seed=seed)
128
  obs = r.observation.model_dump()
129
 
130
+ log_start(task="task1_vuln_detection", env=ENV_BENCHMARK, model=MODEL_NAME) # type: ignore
131
 
132
+ messages: List[ChatCompletionMessageParam] = [ # type: ignore
133
+ {"role": "system", "content": T1_SYSTEM}
134
+ ]
135
  step_rewards: List[float] = []
136
  grader_score = 0.0
137
  steps_taken = 0
 
142
  messages.append({"role": "user", "content": _t1_user_msg(obs)})
143
  try:
144
  resp = client.chat.completions.create(
145
+ model=MODEL_NAME, messages=messages, # type: ignore
146
  max_tokens=200, temperature=0.0,
147
  )
148
  raw = resp.choices[0].message.content.strip() # type: ignore
 
212
  obs = r.observation.model_dump()
213
  fn = obs["extra"].get("target_function", "?")
214
 
215
+ log_start(task="task2_property_discovery", env=ENV_BENCHMARK, model=MODEL_NAME) # type: ignore
216
 
217
+ messages: List[ChatCompletionMessageParam] = [ # type: ignore
218
+ {"role": "system", "content": T2_SYSTEM}
219
+ ]
220
  step_rewards: List[float] = []
221
  grader_score = 0.0
222
  steps_taken = 0
 
227
  messages.append({"role": "user", "content": _t2_user_msg(obs)})
228
  try:
229
  resp = client.chat.completions.create(
230
+ model=MODEL_NAME, messages=messages, # type: ignore
231
  max_tokens=400, temperature=0.0,
232
  )
233
  raw = resp.choices[0].message.content.strip() # type: ignore
 
295
  r = env.reset(seed=seed)
296
  obs = r.observation.model_dump()
297
 
298
+ log_start(task="task3_rule_checker", env=ENV_BENCHMARK, model=MODEL_NAME) # type: ignore
299
 
300
+ messages: List[ChatCompletionMessageParam] = [ # type: ignore
301
+ {"role": "system", "content": T3_SYSTEM}
302
+ ]
303
  step_rewards: List[float] = []
304
  grader_score = 0.0
305
  steps_taken = 0
 
310
  messages.append({"role": "user", "content": _t3_user_msg(obs)})
311
  try:
312
  resp = client.chat.completions.create(
313
+ model=MODEL_NAME, messages=messages, # type: ignore
314
  max_tokens=200, temperature=0.0,
315
  )
316
  raw = resp.choices[0].message.content.strip() # type: ignore
openenv.yaml CHANGED
@@ -113,15 +113,6 @@ data:
113
  num_vulnerable_functions: 8
114
  num_property_functions: 11
115
  num_task3_episodes: 8
116
- vulnerability_types:
117
- - Reentrancy
118
- - Missing access control
119
- - Integer overflow
120
- - tx.origin authentication
121
- - Front-running
122
- - Timestamp dependence
123
- - Denial of service (unbounded loop)
124
- - Unchecked return value
125
 
126
  interface:
127
  http:
 
113
  num_vulnerable_functions: 8
114
  num_property_functions: 11
115
  num_task3_episodes: 8
 
 
 
 
 
 
 
 
 
116
 
117
  interface:
118
  http:
pyproject.toml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-smartcontractenv"
13
+ version = "0.1.0"
14
+ description = "Smartcontractenv environment for OpenEnv"
15
+ requires-python = ">=3.10"
16
+ dependencies = [
17
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
+ # install from github
19
+ # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
+ "openenv-core[core]>=0.2.2",
21
+ # Environment-specific dependencies
22
+ # Add all dependencies needed for your environment here
23
+ # Examples:
24
+ # "numpy>=1.19.0",
25
+ # "torch>=2.0.0",
26
+ # "gymnasium>=0.29.0",
27
+ # "openspiel>=1.0.0",
28
+ # "smolagents>=1.22.0,<2",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ dev = [
33
+ "pytest>=8.0.0",
34
+ "pytest-cov>=4.0.0",
35
+ ]
36
+
37
+ [project.scripts]
38
+ # Server entry point - enables running via: uv run --project . server
39
+ # or: python -m smartcontractenv.server.app
40
+ server = "smartcontractenv.server.app:main"
41
+
42
+ [tool.setuptools]
43
+ include-package-data = true
44
+ packages = ["smartcontractenv", "smartcontractenv.server"]
45
+ package-dir = { "smartcontractenv" = ".", "smartcontractenv.server" = "server" }
requirements.txt CHANGED
@@ -9,4 +9,5 @@ pandas==2.2.2
9
  numpy==2.1.1
10
  scikit-learn==1.5.0
11
  sentence-transformers==3.0.1
12
- nltk==3.9.4
 
 
9
  numpy==2.1.1
10
  scikit-learn==1.5.0
11
  sentence-transformers==3.0.1
12
+ nltk==3.9.4
13
+ openenv[core]>=0.2.0
server/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This module initializes the server package and imports all task environments and graders.
3
+ It serves as the central point for accessing all server components, including the main environment
4
+ and task-specific environments and graders.
5
+ """
6
+
7
+ from server.tasks import (
8
+ Task1Environment, Task1Grader,
9
+ Task2Environment, Task2Grader,
10
+ Task3Environment, Task3Grader
11
+ )
12
+
13
+ __all__ = ["Task1Environment", "Task1Grader", "Task2Environment", "Task2Grader", "Task3Environment", "Task3Grader"]
app.py β†’ server/app.py RENAMED
@@ -16,7 +16,7 @@ Sessions are keyed by a UUID in the `session_id` query parameter.
16
  If omitted, "default" is used (fine for sequential single-agent runs).
17
  """
18
 
19
- from typing import Dict, Optional
20
 
21
  from fastapi import FastAPI, HTTPException, Query
22
  from pydantic import BaseModel
@@ -43,7 +43,7 @@ app = FastAPI(
43
  # Session management
44
  # ─────────────────────────────────────────────────────────────────────────────
45
 
46
- _sessions: Dict[str, object] = {}
47
  DEFAULT_SESSION = "default"
48
 
49
  TASK_ENV_MAP = {
@@ -81,6 +81,10 @@ class StepRequest(BaseModel):
81
  # Routes
82
  # ─────────────────────────────────────────────────────────────────────────────
83
 
 
 
 
 
84
  @app.get("/health")
85
  def health():
86
  """Liveness probe."""
@@ -236,6 +240,9 @@ def observation_space():
236
  # Entry point
237
  # ─────────────────────────────────────────────────────────────────────────────
238
 
239
- if __name__ == "__main__":
240
  import uvicorn
241
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
 
 
 
 
16
  If omitted, "default" is used (fine for sequential single-agent runs).
17
  """
18
 
19
+ from typing import Dict, Optional, Union
20
 
21
  from fastapi import FastAPI, HTTPException, Query
22
  from pydantic import BaseModel
 
43
  # Session management
44
  # ─────────────────────────────────────────────────────────────────────────────
45
 
46
+ _sessions: Dict[str, Union[Task1Environment, Task2Environment, Task3Environment]] = {}
47
  DEFAULT_SESSION = "default"
48
 
49
  TASK_ENV_MAP = {
 
81
  # Routes
82
  # ─────────────────────────────────────────────────────────────────────────────
83
 
84
+ @app.get("/")
85
+ def root():
86
+ return {"message": "Welcome to the Smart Contract Audit RL Environment! Visit README.md for documentation."}
87
+
88
  @app.get("/health")
89
  def health():
90
  """Liveness probe."""
 
240
  # Entry point
241
  # ─────────────────────────────────────────────────────────────────────────────
242
 
243
+ def main():
244
  import uvicorn
245
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
246
+
247
+ if __name__ == "__main__":
248
+ main()
server/tasks/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tasks package
2
+ from server.tasks.task1 import Task1Environment, Task1Grader
3
+ from server.tasks.task2 import Task2Environment, Task2Grader
4
+ from server.tasks.task3 import Task3Environment, Task3Grader
5
+
6
+ __all__ = [
7
+ "Task1Environment",
8
+ "Task1Grader",
9
+ "Task2Environment",
10
+ "Task2Grader",
11
+ "Task3Environment",
12
+ "Task3Grader",
13
+ ]
{tasks β†’ server/tasks}/task1/__init__.py RENAMED
@@ -1,5 +1,5 @@
1
  # task1 package
2
- from tasks.task1.environment import Task1Environment
3
- from tasks.task1.grader import Task1Grader
4
 
5
  __all__ = ["Task1Environment", "Task1Grader"]
 
1
  # task1 package
2
+ from server.tasks.task1.environment import Task1Environment
3
+ from server.tasks.task1.grader import Task1Grader
4
 
5
  __all__ = ["Task1Environment", "Task1Grader"]
{tasks β†’ server/tasks}/task1/actions.py RENAMED
File without changes
{tasks β†’ server/tasks}/task1/environment.py RENAMED
@@ -38,8 +38,8 @@ from env.schemas import (
38
  StateResult,
39
  StepResult,
40
  )
41
- from tasks.task1.grader import Task1Grader
42
- from tasks.task1 import actions
43
 
44
  TASK_ID = "task1_vuln_detection"
45
 
 
38
  StateResult,
39
  StepResult,
40
  )
41
+ from server.tasks.task1 import actions
42
+ from .grader import Task1Grader
43
 
44
  TASK_ID = "task1_vuln_detection"
45
 
{tasks β†’ server/tasks}/task1/grader.py RENAMED
File without changes
{tasks β†’ server/tasks}/task2/__init__.py RENAMED
@@ -1,5 +1,5 @@
1
  # Task 2: Property Discovery
2
- from tasks.task2.environment import Task2Environment
3
- from tasks.task2.grader import Task2Grader
4
 
5
  __all__ = ["Task2Environment", "Task2Grader"]
 
1
  # Task 2: Property Discovery
2
+ from server.tasks.task2.environment import Task2Environment
3
+ from server.tasks.task2.grader import Task2Grader
4
 
5
  __all__ = ["Task2Environment", "Task2Grader"]
{tasks β†’ server/tasks}/task2/actions.py RENAMED
File without changes
{tasks β†’ server/tasks}/task2/environment.py RENAMED
@@ -39,8 +39,8 @@ from env.schemas import (
39
  StateResult,
40
  StepResult,
41
  )
42
- from tasks.task2.grader import Task2Grader
43
- from tasks.task2 import actions
44
 
45
  TASK_ID = "task2_property_discovery"
46
  MAX_STEPS = 15
 
39
  StateResult,
40
  StepResult,
41
  )
42
+ from .grader import Task2Grader
43
+ from server.tasks.task2 import actions
44
 
45
  TASK_ID = "task2_property_discovery"
46
  MAX_STEPS = 15
{tasks β†’ server/tasks}/task2/grader.py RENAMED
File without changes
{tasks β†’ server/tasks}/task3/__init__.py RENAMED
@@ -1,5 +1,5 @@
1
  # Task 3: Rule Checker
2
- from tasks.task3.environment import Task3Environment
3
- from tasks.task3.grader import Task3Grader
4
 
5
  __all__ = ["Task3Environment", "Task3Grader"]
 
1
  # Task 3: Rule Checker
2
+ from server.tasks.task3.environment import Task3Environment
3
+ from server.tasks.task3.grader import Task3Grader
4
 
5
  __all__ = ["Task3Environment", "Task3Grader"]
{tasks β†’ server/tasks}/task3/actions.py RENAMED
File without changes
{tasks β†’ server/tasks}/task3/environment.py RENAMED
@@ -35,7 +35,6 @@ from __future__ import annotations
35
 
36
  import random
37
  from typing import Any, Dict, List, Optional, Set
38
- from tasks.task3 import actions
39
 
40
  from data.data_loader import load_contracts, sample_task3_episode
41
  from env.base_env import BaseEnv
@@ -48,7 +47,8 @@ from env.schemas import (
48
  StateResult,
49
  StepResult,
50
  )
51
- from tasks.task3.grader import Task3Grader
 
52
 
53
  TASK_ID = "task3_rule_checker"
54
  MAX_STEPS = 15
 
35
 
36
  import random
37
  from typing import Any, Dict, List, Optional, Set
 
38
 
39
  from data.data_loader import load_contracts, sample_task3_episode
40
  from env.base_env import BaseEnv
 
47
  StateResult,
48
  StepResult,
49
  )
50
+ from .grader import Task3Grader
51
+ from server.tasks.task3 import actions
52
 
53
  TASK_ID = "task3_rule_checker"
54
  MAX_STEPS = 15
{tasks β†’ server/tasks}/task3/grader.py RENAMED
File without changes
tasks/__init__.py DELETED
@@ -1 +0,0 @@
1
- # tasks package
 
 
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
validate-submission.sh ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # validate-submission.sh β€” OpenEnv Submission Validator
4
+ #
5
+ # Checks that your HF Space is live, Docker image builds, and openenv validate passes.
6
+ #
7
+ # Prerequisites:
8
+ # - Docker: https://docs.docker.com/get-docker/
9
+ # - openenv-core: pip install openenv-core
10
+ # - curl (usually pre-installed)
11
+ #
12
+ # Run:
13
+ # curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
14
+ #
15
+ # Or download and run locally:
16
+ # chmod +x validate-submission.sh
17
+ # ./validate-submission.sh <ping_url> [repo_dir]
18
+ #
19
+ # Arguments:
20
+ # ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
21
+ # repo_dir Path to your repo (default: current directory)
22
+ #
23
+ # Examples:
24
+ # ./validate-submission.sh https://my-team.hf.space
25
+ # ./validate-submission.sh https://my-team.hf.space ./my-repo
26
+ #
27
+
28
+ set -uo pipefail
29
+
30
+ DOCKER_BUILD_TIMEOUT=600
31
+ if [ -t 1 ]; then
32
+ RED='\033[0;31m'
33
+ GREEN='\033[0;32m'
34
+ YELLOW='\033[1;33m'
35
+ BOLD='\033[1m'
36
+ NC='\033[0m'
37
+ else
38
+ RED='' GREEN='' YELLOW='' BOLD='' NC=''
39
+ fi
40
+
41
+ run_with_timeout() {
42
+ local secs="$1"; shift
43
+ if command -v timeout &>/dev/null; then
44
+ timeout "$secs" "$@"
45
+ elif command -v gtimeout &>/dev/null; then
46
+ gtimeout "$secs" "$@"
47
+ else
48
+ "$@" &
49
+ local pid=$!
50
+ ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
51
+ local watcher=$!
52
+ wait "$pid" 2>/dev/null
53
+ local rc=$?
54
+ kill "$watcher" 2>/dev/null
55
+ wait "$watcher" 2>/dev/null
56
+ return $rc
57
+ fi
58
+ }
59
+
60
+ portable_mktemp() {
61
+ local prefix="${1:-validate}"
62
+ mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
63
+ }
64
+
65
+ CLEANUP_FILES=()
66
+ cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
67
+ trap cleanup EXIT
68
+
69
+ PING_URL="${1:-}"
70
+ REPO_DIR="${2:-.}"
71
+
72
+ if [ -z "$PING_URL" ]; then
73
+ printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
74
+ printf "\n"
75
+ printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
76
+ printf " repo_dir Path to your repo (default: current directory)\n"
77
+ exit 1
78
+ fi
79
+
80
+ if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
81
+ printf "Error: directory '%s' not found\n" "${2:-.}"
82
+ exit 1
83
+ fi
84
+ PING_URL="${PING_URL%/}"
85
+ export PING_URL
86
+ PASS=0
87
+
88
+ log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
89
+ pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
90
+ fail() { log "${RED}FAILED${NC} -- $1"; }
91
+ hint() { printf " ${YELLOW}Hint:${NC} %b\n" "$1"; }
92
+ stop_at() {
93
+ printf "\n"
94
+ printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
95
+ exit 1
96
+ }
97
+
98
+ printf "\n"
99
+ printf "${BOLD}========================================${NC}\n"
100
+ printf "${BOLD} OpenEnv Submission Validator${NC}\n"
101
+ printf "${BOLD}========================================${NC}\n"
102
+ log "Repo: $REPO_DIR"
103
+ log "Ping URL: $PING_URL"
104
+ printf "\n"
105
+
106
+ log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
107
+
108
+ CURL_OUTPUT=$(portable_mktemp "validate-curl")
109
+ CLEANUP_FILES+=("$CURL_OUTPUT")
110
+ HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
111
+ -H "Content-Type: application/json" -d '{}' \
112
+ "$PING_URL/reset" --max-time 30 2>"$CURL_OUTPUT" || printf "000")
113
+
114
+ if [ "$HTTP_CODE" = "200" ]; then
115
+ pass "HF Space is live and responds to /reset"
116
+ elif [ "$HTTP_CODE" = "000" ]; then
117
+ fail "HF Space not reachable (connection failed or timed out)"
118
+ hint "Check your network connection and that the Space is running."
119
+ hint "Try: curl -s -o /dev/null -w '%%{http_code}' -X POST $PING_URL/reset"
120
+ stop_at "Step 1"
121
+ else
122
+ fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
123
+ hint "Make sure your Space is running and the URL is correct."
124
+ hint "Try opening $PING_URL in your browser first."
125
+ stop_at "Step 1"
126
+ fi
127
+
128
+ log "${BOLD}Step 2/3: Running docker build${NC} ..."
129
+
130
+ if ! command -v docker &>/dev/null; then
131
+ fail "docker command not found"
132
+ hint "Install Docker: https://docs.docker.com/get-docker/"
133
+ stop_at "Step 2"
134
+ fi
135
+
136
+ if [ -f "$REPO_DIR/Dockerfile" ]; then
137
+ DOCKER_CONTEXT="$REPO_DIR"
138
+ elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
139
+ DOCKER_CONTEXT="$REPO_DIR/server"
140
+ else
141
+ fail "No Dockerfile found in repo root or server/ directory"
142
+ stop_at "Step 2"
143
+ fi
144
+
145
+ log " Found Dockerfile in $DOCKER_CONTEXT"
146
+
147
+ BUILD_OK=false
148
+ BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
149
+
150
+ if [ "$BUILD_OK" = true ]; then
151
+ pass "Docker build succeeded"
152
+ else
153
+ fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
154
+ printf "%s\n" "$BUILD_OUTPUT" | tail -20
155
+ stop_at "Step 2"
156
+ fi
157
+
158
+ log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
159
+
160
+ if ! command -v openenv &>/dev/null; then
161
+ fail "openenv command not found"
162
+ hint "Install it: pip install openenv-core"
163
+ stop_at "Step 3"
164
+ fi
165
+
166
+ VALIDATE_OK=false
167
+ VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
168
+
169
+ if [ "$VALIDATE_OK" = true ]; then
170
+ pass "openenv validate passed"
171
+ [ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
172
+ else
173
+ fail "openenv validate failed"
174
+ printf "%s\n" "$VALIDATE_OUTPUT"
175
+ stop_at "Step 3"
176
+ fi
177
+
178
+ printf "\n"
179
+ printf "${BOLD}========================================${NC}\n"
180
+ printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
181
+ printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
182
+ printf "${BOLD}========================================${NC}\n"
183
+ printf "\n"
184
+
185
+ exit 0
validate.py DELETED
@@ -1,302 +0,0 @@
1
- """
2
- validate.py
3
- -----------
4
- Pre-submission validation — 23 checks across all three tasks.
5
- Usage: python validate.py
6
- Exit 0 = all pass. Exit 1 = failures.
7
- """
8
-
9
- import json, sys
10
- from typing import Callable, List, Tuple
11
-
12
- PASS = "βœ…"; FAIL = "❌"
13
- results: List[Tuple[str, bool, str]] = []
14
-
15
- def check(name: str, fn: Callable) -> None:
16
- try:
17
- fn(); results.append((name, True, ""))
18
- print(f" {PASS} {name}")
19
- except Exception as e:
20
- results.append((name, False, str(e)))
21
- print(f" {FAIL} {name}\n {e}")
22
-
23
- # ── Checks ──────────────────────────────────────────────────────────────────
24
-
25
def check_imports():
    """All env, task, and data modules must import cleanly."""
    from env.schemas import (
        Observation, Action, Reward, StepResult, ResetResult, StateResult, ActionType,
    )
    from tasks.task1.environment import Task1Environment
    from tasks.task1.grader import Task1Grader
    from tasks.task2.environment import Task2Environment
    from tasks.task2.grader import Task2Grader
    from tasks.task3.environment import Task3Environment
    from tasks.task3.grader import Task3Grader
    from data.data_loader import load_contracts
31
-
32
def check_openenv_yaml():
    """openenv.yaml declares name, spaces, reward, and >=3 tasks (>=2 active)."""
    import yaml  # third-party; only needed for this check

    with open("openenv.yaml") as f:
        spec = yaml.safe_load(f)

    task_list = spec.get("tasks", [])
    assert "name" in spec and len(task_list) >= 3
    assert "observation_space" in spec and "action_space" in spec and "reward" in spec
    active = [t for t in task_list if t.get("status") == "active"]
    assert len(active) >= 2, f"Expected >=2 active tasks, got {len(active)}"
40
-
41
def check_pydantic_models():
    """Core schema models construct for every action type and edge reward values."""
    from env.schemas import Observation, Action, ActionType, Reward, StepResult, ResetResult

    obs = Observation(
        task_id="t", contract_name="C", contract_description="D", available_actions=[]
    )
    action_types = (
        ActionType.LIST_FUNCTIONS,
        ActionType.SUBMIT_PROPERTY,
        ActionType.GET_PROPERTY_SPECIFICATION,
        ActionType.SUBMIT_FUNCTION,
    )
    for action_type in action_types:
        Action(action_type=action_type)
    Reward(value=-1.5, reason="test")  # negative rewards must validate too
    StepResult(observation=obs, reward=Reward(value=0, reason=""), done=False)
49
-
50
def check_data_loading():
    """Dataset exposes >=3 entries per task; each task3 entry is fully specified."""
    from data.data_loader import (
        load_contracts,
        get_all_vulnerable_entries,
        get_all_property_entries,
        get_all_task3_entries,
    )

    contracts = load_contracts()
    assert len(get_all_vulnerable_entries(contracts)) >= 3
    assert len(get_all_property_entries(contracts)) >= 3

    entries = get_all_task3_entries(contracts)
    assert len(entries) >= 3, f"Need >=3 task3 entries, got {len(entries)}"
    for _, fn in entries:
        task3_meta = fn.get("task3", {})
        assert task3_meta.get("property_english"), f"{fn['name']} missing property_english"
        assert task3_meta.get("property_formal"), f"{fn['name']} missing property_formal"
62
-
63
def check_t1_env():
    """Task 1 env: reset yields the right task id; one browse step costs -0.05."""
    from tasks.task1.environment import Task1Environment
    from env.schemas import Action, ActionType

    env = Task1Environment()
    reset_result = env.reset(seed=42)
    assert reset_result.observation.task_id == "task1_vuln_detection"

    step_result = env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    assert step_result.reward.value == -0.05
    assert step_result.observation.step_count == 1
    assert env.state().target_function is not None


def check_t2_env():
    """Task 2 env: reset exposes the target and all six browse actions step."""
    from tasks.task2.environment import Task2Environment
    from env.schemas import Action, ActionType

    env = Task2Environment()
    reset_result = env.reset(seed=42)
    assert reset_result.observation.task_id == "task2_property_discovery"
    assert "target_function" in reset_result.observation.extra

    browse_actions = (
        ActionType.GET_FUNCTION_CODE,
        ActionType.GET_FUNCTION_NATSPEC,
        ActionType.GET_FILE_NATSPEC,
        ActionType.GET_SIGNATURE,
        ActionType.GET_RELATED_FUNCTIONS,
        ActionType.GET_SIMILAR_RULE,
    )
    for action_type in browse_actions:
        env.step(Action(action_type=action_type))
83
-
84
def check_t3_env():
    """Task 3 env: reset exposes the property text; every browse action costs reward."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    reset_result = env.reset(seed=42)
    assert reset_result.observation.task_id == "task3_rule_checker"
    assert "property_english" in reset_result.observation.extra
    assert len(reset_result.observation.extra["property_english"]) > 10, \
        "property_english too short"

    for action_type in (
        ActionType.LIST_FUNCTIONS,
        ActionType.GET_PROPERTY_SPECIFICATION,
        ActionType.GET_CALL_GRAPH,
        ActionType.GET_STATE_VARIABLE,
    ):
        step_result = env.step(Action(action_type=action_type))
        assert step_result.reward.value < 0, \
            f"{action_type.value} should have negative shaping reward"
97
-
98
def check_t3_action_costs():
    """Each browse action charges its documented shaping cost.

    A fresh environment is reset per action so earlier steps cannot trigger
    repeated-query penalties and skew the measured cost.

    Fix: the original built and reset an extra Task3Environment before the
    loop that was never used; the dead work is removed.
    """
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    expected_costs = {
        ActionType.GET_PROPERTY_SPECIFICATION: -0.03,
        ActionType.LIST_FUNCTIONS: -0.05,
        ActionType.GET_CALL_GRAPH: -0.08,
    }
    for action_type, expected in expected_costs.items():
        env = Task3Environment()
        env.reset(seed=42)
        step_result = env.step(Action(action_type=action_type))
        # Float compare with tolerance: shaping rewards may be computed.
        assert abs(step_result.reward.value - expected) < 0.001, \
            f"{action_type.value}: expected {expected}, got {step_result.reward.value}"
112
-
113
def check_t3_function_metadata():
    """GET_FUNCTION_METADATA returns visibility info at the standard -0.05 cost."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=43)
    step_result = env.step(
        Action(
            action_type=ActionType.GET_FUNCTION_METADATA,
            params={"function_name": "withdraw"},
        )
    )
    assert "Visibility" in step_result.observation.last_action_result
    assert step_result.reward.value == -0.05


def check_t3_submit_correct():
    """Submitting the true target function ends the episode with +5.0."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=42)
    target = env.state().target_function
    step_result = env.step(
        Action(action_type=ActionType.SUBMIT_FUNCTION, params={"function_name": target})
    )
    assert step_result.done
    assert step_result.reward.value == 5.0, \
        f"Expected reward=5.0, got {step_result.reward.value}"
131
-
132
def check_t3_submit_subfunction():
    """Submitting a callee of the target earns the partial credit of +1.5."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    # seed 45 deterministically selects target `bid`, whose subfunctions
    # include `getPrice`.
    env = Task3Environment()
    env.reset(seed=45)
    assert env.state().target_function == "bid"

    step_result = env.step(
        Action(action_type=ActionType.SUBMIT_FUNCTION, params={"function_name": "getPrice"})
    )
    assert step_result.done
    assert step_result.reward.value == 1.5, \
        f"Expected partial reward=1.5, got {step_result.reward.value}"


def check_t3_submit_wrong():
    """Submitting an unrelated function ends the episode with -1.5."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=42)
    step_result = env.step(
        Action(action_type=ActionType.SUBMIT_FUNCTION, params={"function_name": "constructor"})
    )
    assert step_result.done
    assert step_result.reward.value == -1.5
150
-
151
def check_t3_one_submit_only():
    """After a submission finishes the episode, further steps must raise."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=42)
    env.step(
        Action(action_type=ActionType.SUBMIT_FUNCTION, params={"function_name": "deposit"})
    )
    try:
        env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    except RuntimeError:
        pass  # expected: the episode is already done
    else:
        raise AssertionError("Should raise RuntimeError after done")


def check_t3_repeated_penalty():
    """Repeating the same query is penalised at -0.40."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=42)
    env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    repeat_result = env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    assert repeat_result.reward.value == -0.40
170
-
171
def check_t1_grader():
    """Task 1 rubric: 1.0 exact match, 0.5 vague description, 0.0 wrong function."""
    from tasks.task1.grader import Task1Grader

    grader = Task1Grader("withdraw", "Reentrancy vulnerability")
    assert grader.grade_submission("withdraw", "reentrancy") == 1.0
    assert grader.grade_submission("withdraw", "vague") == 0.5
    assert grader.grade_submission("deposit", "reentrancy") == 0.0


def check_t2_grader():
    """Task 2 grader accepts every gold property and grades deterministically."""
    from tasks.task2.grader import Task2Grader
    from data.data_loader import load_contracts, get_all_property_entries

    for _, fn in get_all_property_entries(load_contracts()):
        grader = Task2Grader(fn["name"], fn["property"])
        # NOTE(review): grade() is indexed with [0] here but compared whole
        # against 0.0 two lines below — presumably it returns (score, ...)
        # and also supports float equality; confirm Task2Grader.grade's
        # return type against its implementation.
        assert grader.grade(fn["property"])[0] >= 0.65
        assert grader.grade("") == 0.0
        score = grader.grade("test")
        assert score == grader.grade("test")  # deterministic
186
-
187
def check_t3_grader():
    """Task 3 grading: 1.0 target (case-insensitive), 0.3 subfunction, else 0.0."""
    from tasks.task3.grader import Task3Grader

    grader = Task3Grader("withdraw", ["deposit"], "some rule")
    assert grader.grade("withdraw") == 1.0
    assert grader.grade("WITHDRAW") == 1.0  # case-insensitive
    assert grader.grade("deposit") == 0.3
    assert grader.grade("constructor") == 0.0

    # (submission, expected score, expected reward) triples.
    for submission, want_score, want_reward in (
        ("withdraw", 1.0, 5.0),
        ("deposit", 0.3, 1.5),
        ("other", 0.0, -1.5),
    ):
        score, reward = grader.grade_and_reward(submission)
        assert score == want_score and reward == want_reward


def check_reward_shaping():
    """Browse actions must not share a single flat cost (non-binary shaping)."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=1)
    distinct_rewards = {
        env.step(Action(action_type=action_type)).reward.value
        for action_type in (
            ActionType.LIST_FUNCTIONS,
            ActionType.GET_PROPERTY_SPECIFICATION,
            ActionType.GET_CALL_GRAPH,
        )
    }
    assert len(distinct_rewards) >= 2
207
-
208
def check_app_imports():
    """FastAPI app boots, /health responds, and exactly 3 tasks are active."""
    from app import app
    from fastapi.testclient import TestClient

    client = TestClient(app)
    health = client.get("/health")
    assert health.status_code == 200

    tasks = client.get("/tasks").json()["tasks"]
    active = [t for t in tasks if t["status"] == "active"]
    assert len(active) == 3, f"Expected 3 active tasks, got {len(active)}: {active}"


def check_t3_http_reset():
    """POST /reset for task 3 returns an observation carrying the property text."""
    from app import app
    from fastapi.testclient import TestClient

    client = TestClient(app)
    response = client.post("/reset", json={"task_id": "task3_rule_checker", "seed": 42})
    assert response.status_code == 200

    observation = response.json()["observation"]
    assert observation["task_id"] == "task3_rule_checker"
    assert "property_english" in observation["extra"]
226
-
227
def check_dockerfile():
    """Dockerfile exists, exposes port 7860, and defines a launch command.

    Fix: the original ``open("Dockerfile").read()`` never closed the file
    handle; ``Path.read_text`` opens and closes it in one call.
    """
    from pathlib import Path

    dockerfile = Path("Dockerfile")
    assert dockerfile.exists()
    content = dockerfile.read_text(encoding="utf-8")
    assert "7860" in content and ("uvicorn" in content or "CMD" in content)
232
-
233
def check_inference_script():
    """inference.py exists and contains the required config plus Task 3 code.

    Fix: the original ``open("inference.py").read()`` never closed the file
    handle; ``Path.read_text`` opens and closes it in one call.
    """
    from pathlib import Path

    script = Path("inference.py")
    assert script.exists()
    content = script.read_text(encoding="utf-8")
    assert "HF_TOKEN" in content and "API_BASE_URL" in content and "MODEL_NAME" in content
    assert "Task3Environment" in content or "run_task3" in content
    assert "submit_function" in content
240
-
241
def check_baseline_json():
    """If baseline_scores.json exists, each task's avg_grader_score is in [0, 1].

    The file is optional, so absence is not a failure.

    Fix: the original ``json.load(open(...))`` never closed the file handle;
    a ``with`` block closes it deterministically.
    """
    import os

    if not os.path.exists("baseline_scores.json"):
        return
    with open("baseline_scores.json", encoding="utf-8") as f:
        data = json.load(f)
    for task in data.get("tasks", []):
        assert 0.0 <= task["avg_grader_score"] <= 1.0
247
-
248
- # ── Runner ──────────────────────────────────────────────────────────────────
249
-
250
# Registry of every pre-submission check as (report label, callable),
# executed top to bottom by main(). Order matters only for the printed report.
ALL_CHECKS = [
    ("Python imports (T1+T2+T3)", check_imports),
    ("openenv.yaml: 3 tasks, ≥2 active", check_openenv_yaml),
    ("Pydantic models (all ActionTypes)", check_pydantic_models),
    ("Dataset: vuln+property+task3 entries", check_data_loading),
    ("T1 env: reset/step/state", check_t1_env),
    ("T2 env: reset + 6 browse actions", check_t2_env),
    ("T3 env: reset + browse actions", check_t3_env),
    ("T3 action costs (formalized -0.03)", check_t3_action_costs),
    ("T3 get_function_metadata", check_t3_function_metadata),
    ("T3 submit correct → +5.0", check_t3_submit_correct),
    ("T3 submit subfunction → +1.5", check_t3_submit_subfunction),
    ("T3 submit wrong → -1.5", check_t3_submit_wrong),
    ("T3 one submit per episode", check_t3_one_submit_only),
    ("T3 repeated query → -0.40", check_t3_repeated_penalty),
    ("T1 grader: 0/0.5/1.0 rubric", check_t1_grader),
    ("T2 grader: all 11 properties", check_t2_grader),
    ("T3 grader: 1.0/0.3/0.0 + case-ins.", check_t3_grader),
    ("Reward shaping non-binary (T3)", check_reward_shaping),
    ("FastAPI: 3 active tasks", check_app_imports),
    ("FastAPI: T3 reset endpoint", check_t3_http_reset),
    ("Dockerfile + port 7860", check_dockerfile),
    ("inference.py: T3 code present", check_inference_script),
    ("baseline_scores.json schema", check_baseline_json),
]
275
-
276
def main():
    """Run every registered check, print a summary, and exit 0 (pass) or 1 (fail)."""
    banner = "=" * 64
    print(banner)
    print("OpenEnv Pre-Submission Validation (Task 1 + 2 + 3)")
    print(banner)
    print()

    for name, fn in ALL_CHECKS:
        check(name, fn)

    passed = sum(1 for _, ok, _ in results if ok)
    total = len(results)
    failed = [(name, msg) for name, ok, msg in results if not ok]

    print()
    print(banner)
    print(f"Results: {passed}/{total} checks passed")

    if not failed:
        print("\n✅ ALL CHECKS PASSED — ready to submit!")
        sys.exit(0)

    print("\nFailed checks:")
    for name, msg in failed:
        print(f" {FAIL} {name}: {msg}")
    print("\n❌ VALIDATION FAILED")
    sys.exit(1)


if __name__ == "__main__":
    main()