ajaxwin commited on
Commit
88875f7
Β·
1 Parent(s): 1b91307

Structure Changed, files reviewed

Browse files
Dockerfile CHANGED
@@ -1,35 +1,80 @@
1
- # ---------------------------------------------------------------------------
2
- # Smart Contract Audit RL Environment
3
- # Hugging Face Space β€” Docker runtime
4
- # ---------------------------------------------------------------------------
 
5
 
6
- FROM python:3.11-slim
 
 
 
 
 
 
 
7
 
8
  WORKDIR /app
9
 
10
- # System deps
11
- RUN apt-get update && apt-get install -y --no-install-recommends \
12
- curl \
13
- && rm -rf /var/lib/apt/lists/*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- # Install Python deps first (layer cache)
16
- COPY requirements.txt .
17
- RUN pip install --no-cache-dir -r requirements.txt
18
 
19
- # Copy project
20
- COPY . .
21
 
22
- # Create empty __init__ files if missing (safety)
23
- RUN touch env/__init__.py tasks/__init__.py tasks/task1/__init__.py \
24
- tasks/task2/__init__.py tasks/task3/__init__.py \
25
- data/__init__.py utils/__init__.py
26
 
27
- # HF Spaces requires port 7860
28
- EXPOSE 7860
29
 
30
- # Healthcheck
31
- HEALTHCHECK --interval=30s --timeout=10s --start-period=15s --retries=3 \
32
- CMD curl -f http://localhost:7860/health || exit 1
33
 
34
- # Launch FastAPI
35
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
 
7
+ # Multi-stage build using openenv-base
8
+ # This Dockerfile is flexible and works for both:
9
+ # - In-repo environments (with local OpenEnv sources)
10
+ # - Standalone environments (with openenv from PyPI/Git)
11
+ # The build script (openenv build) handles context detection and sets appropriate build args.
12
+
13
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
+ FROM ${BASE_IMAGE} AS builder
15
 
16
  WORKDIR /app
17
 
18
+ # Ensure git is available (required for installing dependencies from VCS)
19
+ RUN apt-get update && \
20
+ apt-get install -y --no-install-recommends git && \
21
+ rm -rf /var/lib/apt/lists/*
22
+
23
+ # Build argument to control whether we're building standalone or in-repo
24
+ ARG BUILD_MODE=in-repo
25
+ ARG ENV_NAME=smartcontractenv
26
+
27
+ # Copy environment code (always at root of build context)
28
+ COPY . /app/env
29
+
30
+ # For in-repo builds, openenv is already vendored in the build context
31
+ # For standalone builds, openenv will be installed via pyproject.toml
32
+ WORKDIR /app/env
33
+
34
+ # Ensure uv is available (for local builds where base image lacks it)
35
+ RUN if ! command -v uv >/dev/null 2>&1; then \
36
+ curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
38
+ mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
+ fi
40
+
41
+ # Install dependencies using uv sync
42
+ # If uv.lock exists, use it; otherwise resolve on the fly
43
+ RUN --mount=type=cache,target=/root/.cache/uv \
44
+ if [ -f uv.lock ]; then \
45
+ uv sync --frozen --no-install-project --no-editable; \
46
+ else \
47
+ uv sync --no-install-project --no-editable; \
48
+ fi
49
+
50
+ RUN --mount=type=cache,target=/root/.cache/uv \
51
+ if [ -f uv.lock ]; then \
52
+ uv sync --frozen --no-editable; \
53
+ else \
54
+ uv sync --no-editable; \
55
+ fi
56
+
57
+ # Final runtime stage
58
+ FROM ${BASE_IMAGE}
59
+
60
+ WORKDIR /app
61
 
62
+ # Copy the virtual environment from builder
63
+ COPY --from=builder /app/env/.venv /app/.venv
 
64
 
65
+ # Copy the environment code
66
+ COPY --from=builder /app/env /app/env
67
 
68
+ # Set PATH to use the virtual environment
69
+ ENV PATH="/app/.venv/bin:$PATH"
 
 
70
 
71
+ # Set PYTHONPATH so imports work correctly
72
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
73
 
74
+ # Health check
75
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
76
+ CMD curl -f http://localhost:8000/health || exit 1
77
 
78
+ # Run the FastAPI server
79
+ # The module path is constructed to work with the /app/env structure
80
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 7860"]
agents/task1.py CHANGED
@@ -3,7 +3,7 @@
3
  import random as _random
4
  from typing import Any, Dict, List
5
 
6
- from tasks.task1 import Task1Environment
7
  from env.schemas import Action, ActionType
8
  from data.data_loader import load_contracts, get_function_by_name
9
 
 
3
  import random as _random
4
  from typing import Any, Dict, List
5
 
6
+ from server import Task1Environment
7
  from env.schemas import Action, ActionType
8
  from data.data_loader import load_contracts, get_function_by_name
9
 
agents/task2.py CHANGED
@@ -3,7 +3,7 @@
3
  import random as _random
4
  from typing import Any, Dict, List
5
 
6
- from tasks.task2 import Task2Environment
7
  from env.schemas import Action, ActionType
8
  from data.data_loader import load_contracts, get_function_by_name
9
 
 
3
  import random as _random
4
  from typing import Any, Dict, List
5
 
6
+ from server import Task2Environment
7
  from env.schemas import Action, ActionType
8
  from data.data_loader import load_contracts, get_function_by_name
9
 
agents/task3.py CHANGED
@@ -4,7 +4,7 @@ import json
4
  import random as _random
5
  from typing import Any, Dict, List
6
 
7
- from tasks.task3 import Task3Environment
8
  from env.schemas import Action, ActionType
9
  from data.data_loader import load_contracts, get_function_by_name
10
 
 
4
  import random as _random
5
  from typing import Any, Dict, List
6
 
7
+ from server import Task3Environment
8
  from env.schemas import Action, ActionType
9
  from data.data_loader import load_contracts, get_function_by_name
10
 
eval.py CHANGED
@@ -21,9 +21,7 @@ import json
21
  import random as _random
22
  from typing import Any, Dict, List
23
 
24
- from tasks.task1 import Task1Environment
25
- from tasks.task2 import Task2Environment
26
- from tasks.task3 import Task3Environment
27
  from agents.task1 import oracle_t1, partial_t1, random_t1, floor_t1
28
  from agents.task2 import oracle_t2, partial_t2, random_t2, floor_t2
29
  from agents.task3 import oracle_t3, subfunction_t3, random_t3, floor_t3
 
21
  import random as _random
22
  from typing import Any, Dict, List
23
 
24
+ from server import Task1Environment, Task2Environment, Task3Environment
 
 
25
  from agents.task1 import oracle_t1, partial_t1, random_t1, floor_t1
26
  from agents.task2 import oracle_t2, partial_t2, random_t2, floor_t2
27
  from agents.task3 import oracle_t3, subfunction_t3, random_t3, floor_t3
inference.py CHANGED
@@ -1,7 +1,7 @@
1
  """
2
  inference.py
3
  ------------
4
- Baseline inference script β€” Smart Contract Audit RL Environment.
5
 
6
  Implements agents for all three tasks using the OpenAI-compatible client.
7
  Emits mandatory structured stdout in the OpenEnv format.
@@ -32,22 +32,23 @@ from typing import Any, Dict, List, Optional
32
 
33
  from openai import OpenAI
34
 
35
- from tasks.task1 import Task1Environment
36
- from tasks.task2 import Task2Environment
37
- from tasks.task3 import Task3Environment
38
  from env.schemas import Action, ActionType
39
  from utils import T1_SYSTEM, T2_SYSTEM, T3_SYSTEM
 
40
 
41
  # ─────────────────────────────────────────────────────────────────────────────
42
  # Configuration
43
  # ─────────────────────────────────────────────────────────────────────────────
44
 
45
- API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1")
46
- MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4o-mini")
47
- HF_TOKEN = os.getenv("HF_TOKEN", "")
 
48
 
49
  if not HF_TOKEN:
50
  print("[WARN] HF_TOKEN not set β€” API calls may fail.", file=sys.stderr)
 
51
 
52
  # Benchmark / environment identifier (constant for this env)
53
  ENV_BENCHMARK = "smart-contract-audit"
@@ -126,9 +127,11 @@ def _run_t1_episode(env: Task1Environment, seed: int, ep_num: int) -> Dict[str,
126
  r = env.reset(seed=seed)
127
  obs = r.observation.model_dump()
128
 
129
- log_start(task="task1_vuln_detection", env=ENV_BENCHMARK, model=MODEL_NAME)
130
 
131
- messages = [{"role": "system", "content": T1_SYSTEM}]
 
 
132
  step_rewards: List[float] = []
133
  grader_score = 0.0
134
  steps_taken = 0
@@ -139,7 +142,7 @@ def _run_t1_episode(env: Task1Environment, seed: int, ep_num: int) -> Dict[str,
139
  messages.append({"role": "user", "content": _t1_user_msg(obs)})
140
  try:
141
  resp = client.chat.completions.create(
142
- model=MODEL_NAME, messages=messages,
143
  max_tokens=200, temperature=0.0,
144
  )
145
  raw = resp.choices[0].message.content.strip() # type: ignore
@@ -209,9 +212,11 @@ def _run_t2_episode(env: Task2Environment, seed: int, ep_num: int) -> Dict[str,
209
  obs = r.observation.model_dump()
210
  fn = obs["extra"].get("target_function", "?")
211
 
212
- log_start(task="task2_property_discovery", env=ENV_BENCHMARK, model=MODEL_NAME)
213
 
214
- messages = [{"role": "system", "content": T2_SYSTEM}]
 
 
215
  step_rewards: List[float] = []
216
  grader_score = 0.0
217
  steps_taken = 0
@@ -222,7 +227,7 @@ def _run_t2_episode(env: Task2Environment, seed: int, ep_num: int) -> Dict[str,
222
  messages.append({"role": "user", "content": _t2_user_msg(obs)})
223
  try:
224
  resp = client.chat.completions.create(
225
- model=MODEL_NAME, messages=messages,
226
  max_tokens=400, temperature=0.0,
227
  )
228
  raw = resp.choices[0].message.content.strip() # type: ignore
@@ -290,9 +295,11 @@ def _run_t3_episode(env: Task3Environment, seed: int, ep_num: int) -> Dict[str,
290
  r = env.reset(seed=seed)
291
  obs = r.observation.model_dump()
292
 
293
- log_start(task="task3_rule_checker", env=ENV_BENCHMARK, model=MODEL_NAME)
294
 
295
- messages = [{"role": "system", "content": T3_SYSTEM}]
 
 
296
  step_rewards: List[float] = []
297
  grader_score = 0.0
298
  steps_taken = 0
@@ -303,7 +310,7 @@ def _run_t3_episode(env: Task3Environment, seed: int, ep_num: int) -> Dict[str,
303
  messages.append({"role": "user", "content": _t3_user_msg(obs)})
304
  try:
305
  resp = client.chat.completions.create(
306
- model=MODEL_NAME, messages=messages,
307
  max_tokens=200, temperature=0.0,
308
  )
309
  raw = resp.choices[0].message.content.strip() # type: ignore
 
1
  """
2
  inference.py
3
  ------------
4
+ Inference script β€” Smart Contract Audit RL Environment.
5
 
6
  Implements agents for all three tasks using the OpenAI-compatible client.
7
  Emits mandatory structured stdout in the OpenEnv format.
 
32
 
33
  from openai import OpenAI
34
 
35
+ from server import Task1Environment, Task2Environment, Task3Environment
 
 
36
  from env.schemas import Action, ActionType
37
  from utils import T1_SYSTEM, T2_SYSTEM, T3_SYSTEM
38
+ from dotenv import dotenv_values
39
 
40
  # ─────────────────────────────────────────────────────────────────────────────
41
  # Configuration
42
  # ─────────────────────────────────────────────────────────────────────────────
43
 
44
+ config = dotenv_values(".env")
45
+ API_BASE_URL = config.get("API_BASE_URL", "https://api.openai.com/v1")
46
+ MODEL_NAME = config.get("MODEL_NAME", "gpt-4o")
47
+ HF_TOKEN = config.get("HF_TOKEN", "")
48
 
49
  if not HF_TOKEN:
50
  print("[WARN] HF_TOKEN not set β€” API calls may fail.", file=sys.stderr)
51
+ exit(1)
52
 
53
  # Benchmark / environment identifier (constant for this env)
54
  ENV_BENCHMARK = "smart-contract-audit"
 
127
  r = env.reset(seed=seed)
128
  obs = r.observation.model_dump()
129
 
130
+ log_start(task="task1_vuln_detection", env=ENV_BENCHMARK, model=MODEL_NAME) # type: ignore
131
 
132
+ messages: List[ChatCompletionMessageParam] = [ # type: ignore
133
+ {"role": "system", "content": T1_SYSTEM}
134
+ ]
135
  step_rewards: List[float] = []
136
  grader_score = 0.0
137
  steps_taken = 0
 
142
  messages.append({"role": "user", "content": _t1_user_msg(obs)})
143
  try:
144
  resp = client.chat.completions.create(
145
+ model=MODEL_NAME, messages=messages, # type: ignore
146
  max_tokens=200, temperature=0.0,
147
  )
148
  raw = resp.choices[0].message.content.strip() # type: ignore
 
212
  obs = r.observation.model_dump()
213
  fn = obs["extra"].get("target_function", "?")
214
 
215
+ log_start(task="task2_property_discovery", env=ENV_BENCHMARK, model=MODEL_NAME) # type: ignore
216
 
217
+ messages: List[ChatCompletionMessageParam] = [ # type: ignore
218
+ {"role": "system", "content": T2_SYSTEM}
219
+ ]
220
  step_rewards: List[float] = []
221
  grader_score = 0.0
222
  steps_taken = 0
 
227
  messages.append({"role": "user", "content": _t2_user_msg(obs)})
228
  try:
229
  resp = client.chat.completions.create(
230
+ model=MODEL_NAME, messages=messages, # type: ignore
231
  max_tokens=400, temperature=0.0,
232
  )
233
  raw = resp.choices[0].message.content.strip() # type: ignore
 
295
  r = env.reset(seed=seed)
296
  obs = r.observation.model_dump()
297
 
298
+ log_start(task="task3_rule_checker", env=ENV_BENCHMARK, model=MODEL_NAME) # type: ignore
299
 
300
+ messages: List[ChatCompletionMessageParam] = [ # type: ignore
301
+ {"role": "system", "content": T3_SYSTEM}
302
+ ]
303
  step_rewards: List[float] = []
304
  grader_score = 0.0
305
  steps_taken = 0
 
310
  messages.append({"role": "user", "content": _t3_user_msg(obs)})
311
  try:
312
  resp = client.chat.completions.create(
313
+ model=MODEL_NAME, messages=messages, # type: ignore
314
  max_tokens=200, temperature=0.0,
315
  )
316
  raw = resp.choices[0].message.content.strip() # type: ignore
openenv.yaml CHANGED
@@ -113,15 +113,6 @@ data:
113
  num_vulnerable_functions: 8
114
  num_property_functions: 11
115
  num_task3_episodes: 8
116
- vulnerability_types:
117
- - Reentrancy
118
- - Missing access control
119
- - Integer overflow
120
- - tx.origin authentication
121
- - Front-running
122
- - Timestamp dependence
123
- - Denial of service (unbounded loop)
124
- - Unchecked return value
125
 
126
  interface:
127
  http:
 
113
  num_vulnerable_functions: 8
114
  num_property_functions: 11
115
  num_task3_episodes: 8
 
 
 
 
 
 
 
 
 
116
 
117
  interface:
118
  http:
pyproject.toml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-smartcontractenv"
13
+ version = "0.1.0"
14
+ description = "Smartcontractenv environment for OpenEnv"
15
+ requires-python = ">=3.10"
16
+ dependencies = [
17
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
+ # install from github
19
+ # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
+ "openenv-core[core]>=0.2.2",
21
+ # Environment-specific dependencies
22
+ # Add all dependencies needed for your environment here
23
+ # Examples:
24
+ # "numpy>=1.19.0",
25
+ # "torch>=2.0.0",
26
+ # "gymnasium>=0.29.0",
27
+ # "openspiel>=1.0.0",
28
+ # "smolagents>=1.22.0,<2",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ dev = [
33
+ "pytest>=8.0.0",
34
+ "pytest-cov>=4.0.0",
35
+ ]
36
+
37
+ [project.scripts]
38
+ # Server entry point - enables running via: uv run --project . server
39
+ # or: python -m smartcontractenv.server.app
40
+ server = "smartcontractenv.server.app:main"
41
+
42
+ [tool.setuptools]
43
+ include-package-data = true
44
+ packages = ["smartcontractenv", "smartcontractenv.server"]
45
+ package-dir = { "smartcontractenv" = ".", "smartcontractenv.server" = "server" }
requirements.txt CHANGED
@@ -9,4 +9,5 @@ pandas==2.2.2
9
  numpy==2.1.1
10
  scikit-learn==1.5.0
11
  sentence-transformers==3.0.1
12
- nltk==3.9.4
 
 
9
  numpy==2.1.1
10
  scikit-learn==1.5.0
11
  sentence-transformers==3.0.1
12
+ nltk==3.9.4
13
+ openenv[core]>=0.2.0
server/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This module initializes the server package and imports all task environments and graders.
3
+ It serves as the central point for accessing all server components, including the main environment
4
+ and task-specific environments and graders.
5
+ """
6
+
7
+ from server.tasks import (
8
+ Task1Environment, Task1Grader,
9
+ Task2Environment, Task2Grader,
10
+ Task3Environment, Task3Grader
11
+ )
12
+
13
+ __all__ = ["Task1Environment", "Task1Grader", "Task2Environment", "Task2Grader", "Task3Environment", "Task3Grader"]
app.py β†’ server/app.py RENAMED
@@ -16,7 +16,7 @@ Sessions are keyed by a UUID in the `session_id` query parameter.
16
  If omitted, "default" is used (fine for sequential single-agent runs).
17
  """
18
 
19
- from typing import Dict, Optional
20
 
21
  from fastapi import FastAPI, HTTPException, Query
22
  from pydantic import BaseModel
@@ -43,7 +43,7 @@ app = FastAPI(
43
  # Session management
44
  # ─────────────────────────────────────────────────────────────────────────────
45
 
46
- _sessions: Dict[str, object] = {}
47
  DEFAULT_SESSION = "default"
48
 
49
  TASK_ENV_MAP = {
@@ -81,6 +81,10 @@ class StepRequest(BaseModel):
81
  # Routes
82
  # ─────────────────────────────────────────────────────────────────────────────
83
 
 
 
 
 
84
  @app.get("/health")
85
  def health():
86
  """Liveness probe."""
@@ -236,6 +240,9 @@ def observation_space():
236
  # Entry point
237
  # ─────────────────────────────────────────────────────────────────────────────
238
 
239
- if __name__ == "__main__":
240
  import uvicorn
241
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
 
 
 
 
16
  If omitted, "default" is used (fine for sequential single-agent runs).
17
  """
18
 
19
+ from typing import Dict, Optional, Union
20
 
21
  from fastapi import FastAPI, HTTPException, Query
22
  from pydantic import BaseModel
 
43
  # Session management
44
  # ─────────────────────────────────────────────────────────────────────────────
45
 
46
+ _sessions: Dict[str, Union[Task1Environment, Task2Environment, Task3Environment]] = {}
47
  DEFAULT_SESSION = "default"
48
 
49
  TASK_ENV_MAP = {
 
81
  # Routes
82
  # ─────────────────────────────────────────────────────────────────────────────
83
 
84
+ @app.get("/")
85
+ def root():
86
+ return {"message": "Welcome to the Smart Contract Audit RL Environment! Visit README.md for documentation."}
87
+
88
  @app.get("/health")
89
  def health():
90
  """Liveness probe."""
 
240
  # Entry point
241
  # ─────────────────────────────────────────────────────────────────────────────
242
 
243
+ def main():
244
  import uvicorn
245
  uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
246
+
247
+ if __name__ == "__main__":
248
+ main()
server/tasks/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tasks package
2
+ from server.tasks.task1 import Task1Environment, Task1Grader
3
+ from server.tasks.task2 import Task2Environment, Task2Grader
4
+ from server.tasks.task3 import Task3Environment, Task3Grader
5
+
6
+ __all__ = [
7
+ "Task1Environment",
8
+ "Task1Grader",
9
+ "Task2Environment",
10
+ "Task2Grader",
11
+ "Task3Environment",
12
+ "Task3Grader",
13
+ ]
{tasks β†’ server/tasks}/task1/__init__.py RENAMED
@@ -1,5 +1,5 @@
1
  # task1 package
2
- from tasks.task1.environment import Task1Environment
3
- from tasks.task1.grader import Task1Grader
4
 
5
  __all__ = ["Task1Environment", "Task1Grader"]
 
1
  # task1 package
2
+ from server.tasks.task1.environment import Task1Environment
3
+ from server.tasks.task1.grader import Task1Grader
4
 
5
  __all__ = ["Task1Environment", "Task1Grader"]
{tasks β†’ server/tasks}/task1/actions.py RENAMED
File without changes
{tasks β†’ server/tasks}/task1/environment.py RENAMED
@@ -38,8 +38,8 @@ from env.schemas import (
38
  StateResult,
39
  StepResult,
40
  )
41
- from tasks.task1.grader import Task1Grader
42
- from tasks.task1 import actions
43
 
44
  TASK_ID = "task1_vuln_detection"
45
 
 
38
  StateResult,
39
  StepResult,
40
  )
41
+ from server.tasks.task1 import actions
42
+ from .grader import Task1Grader
43
 
44
  TASK_ID = "task1_vuln_detection"
45
 
{tasks β†’ server/tasks}/task1/grader.py RENAMED
File without changes
{tasks β†’ server/tasks}/task2/__init__.py RENAMED
@@ -1,5 +1,5 @@
1
  # Task 2: Property Discovery
2
- from tasks.task2.environment import Task2Environment
3
- from tasks.task2.grader import Task2Grader
4
 
5
  __all__ = ["Task2Environment", "Task2Grader"]
 
1
  # Task 2: Property Discovery
2
+ from server.tasks.task2.environment import Task2Environment
3
+ from server.tasks.task2.grader import Task2Grader
4
 
5
  __all__ = ["Task2Environment", "Task2Grader"]
{tasks β†’ server/tasks}/task2/actions.py RENAMED
File without changes
{tasks β†’ server/tasks}/task2/environment.py RENAMED
@@ -39,8 +39,8 @@ from env.schemas import (
39
  StateResult,
40
  StepResult,
41
  )
42
- from tasks.task2.grader import Task2Grader
43
- from tasks.task2 import actions
44
 
45
  TASK_ID = "task2_property_discovery"
46
  MAX_STEPS = 15
 
39
  StateResult,
40
  StepResult,
41
  )
42
+ from .grader import Task2Grader
43
+ from server.tasks.task2 import actions
44
 
45
  TASK_ID = "task2_property_discovery"
46
  MAX_STEPS = 15
{tasks β†’ server/tasks}/task2/grader.py RENAMED
File without changes
{tasks β†’ server/tasks}/task3/__init__.py RENAMED
@@ -1,5 +1,5 @@
1
  # Task 3: Rule Checker
2
- from tasks.task3.environment import Task3Environment
3
- from tasks.task3.grader import Task3Grader
4
 
5
  __all__ = ["Task3Environment", "Task3Grader"]
 
1
  # Task 3: Rule Checker
2
+ from server.tasks.task3.environment import Task3Environment
3
+ from server.tasks.task3.grader import Task3Grader
4
 
5
  __all__ = ["Task3Environment", "Task3Grader"]
{tasks β†’ server/tasks}/task3/actions.py RENAMED
File without changes
{tasks β†’ server/tasks}/task3/environment.py RENAMED
@@ -35,7 +35,6 @@ from __future__ import annotations
35
 
36
  import random
37
  from typing import Any, Dict, List, Optional, Set
38
- from tasks.task3 import actions
39
 
40
  from data.data_loader import load_contracts, sample_task3_episode
41
  from env.base_env import BaseEnv
@@ -48,7 +47,8 @@ from env.schemas import (
48
  StateResult,
49
  StepResult,
50
  )
51
- from tasks.task3.grader import Task3Grader
 
52
 
53
  TASK_ID = "task3_rule_checker"
54
  MAX_STEPS = 15
 
35
 
36
  import random
37
  from typing import Any, Dict, List, Optional, Set
 
38
 
39
  from data.data_loader import load_contracts, sample_task3_episode
40
  from env.base_env import BaseEnv
 
47
  StateResult,
48
  StepResult,
49
  )
50
+ from .grader import Task3Grader
51
+ from server.tasks.task3 import actions
52
 
53
  TASK_ID = "task3_rule_checker"
54
  MAX_STEPS = 15
{tasks β†’ server/tasks}/task3/grader.py RENAMED
File without changes
tasks/__init__.py DELETED
@@ -1 +0,0 @@
1
- # tasks package
 
 
uv.lock ADDED
The diff for this file is too large to render. See raw diff
 
validate-submission.sh ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ #
3
+ # validate-submission.sh β€” OpenEnv Submission Validator
4
+ #
5
+ # Checks that your HF Space is live, Docker image builds, and openenv validate passes.
6
+ #
7
+ # Prerequisites:
8
+ # - Docker: https://docs.docker.com/get-docker/
9
+ # - openenv-core: pip install openenv-core
10
+ # - curl (usually pre-installed)
11
+ #
12
+ # Run:
13
+ # curl -fsSL https://raw.githubusercontent.com/<owner>/<repo>/main/scripts/validate-submission.sh | bash -s -- <ping_url> [repo_dir]
14
+ #
15
+ # Or download and run locally:
16
+ # chmod +x validate-submission.sh
17
+ # ./validate-submission.sh <ping_url> [repo_dir]
18
+ #
19
+ # Arguments:
20
+ # ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)
21
+ # repo_dir Path to your repo (default: current directory)
22
+ #
23
+ # Examples:
24
+ # ./validate-submission.sh https://my-team.hf.space
25
+ # ./validate-submission.sh https://my-team.hf.space ./my-repo
26
+ #
27
+
28
+ set -uo pipefail
29
+
30
+ DOCKER_BUILD_TIMEOUT=600
31
+ if [ -t 1 ]; then
32
+ RED='\033[0;31m'
33
+ GREEN='\033[0;32m'
34
+ YELLOW='\033[1;33m'
35
+ BOLD='\033[1m'
36
+ NC='\033[0m'
37
+ else
38
+ RED='' GREEN='' YELLOW='' BOLD='' NC=''
39
+ fi
40
+
41
+ run_with_timeout() {
42
+ local secs="$1"; shift
43
+ if command -v timeout &>/dev/null; then
44
+ timeout "$secs" "$@"
45
+ elif command -v gtimeout &>/dev/null; then
46
+ gtimeout "$secs" "$@"
47
+ else
48
+ "$@" &
49
+ local pid=$!
50
+ ( sleep "$secs" && kill "$pid" 2>/dev/null ) &
51
+ local watcher=$!
52
+ wait "$pid" 2>/dev/null
53
+ local rc=$?
54
+ kill "$watcher" 2>/dev/null
55
+ wait "$watcher" 2>/dev/null
56
+ return $rc
57
+ fi
58
+ }
59
+
60
+ portable_mktemp() {
61
+ local prefix="${1:-validate}"
62
+ mktemp "${TMPDIR:-/tmp}/${prefix}-XXXXXX" 2>/dev/null || mktemp
63
+ }
64
+
65
+ CLEANUP_FILES=()
66
+ cleanup() { rm -f "${CLEANUP_FILES[@]+"${CLEANUP_FILES[@]}"}"; }
67
+ trap cleanup EXIT
68
+
69
+ PING_URL="${1:-}"
70
+ REPO_DIR="${2:-.}"
71
+
72
+ if [ -z "$PING_URL" ]; then
73
+ printf "Usage: %s <ping_url> [repo_dir]\n" "$0"
74
+ printf "\n"
75
+ printf " ping_url Your HuggingFace Space URL (e.g. https://your-space.hf.space)\n"
76
+ printf " repo_dir Path to your repo (default: current directory)\n"
77
+ exit 1
78
+ fi
79
+
80
+ if ! REPO_DIR="$(cd "$REPO_DIR" 2>/dev/null && pwd)"; then
81
+ printf "Error: directory '%s' not found\n" "${2:-.}"
82
+ exit 1
83
+ fi
84
+ PING_URL="${PING_URL%/}"
85
+ export PING_URL
86
+ PASS=0
87
+
88
+ log() { printf "[%s] %b\n" "$(date -u +%H:%M:%S)" "$*"; }
89
+ pass() { log "${GREEN}PASSED${NC} -- $1"; PASS=$((PASS + 1)); }
90
+ fail() { log "${RED}FAILED${NC} -- $1"; }
91
+ hint() { printf " ${YELLOW}Hint:${NC} %b\n" "$1"; }
92
+ stop_at() {
93
+ printf "\n"
94
+ printf "${RED}${BOLD}Validation stopped at %s.${NC} Fix the above before continuing.\n" "$1"
95
+ exit 1
96
+ }
97
+
98
+ printf "\n"
99
+ printf "${BOLD}========================================${NC}\n"
100
+ printf "${BOLD} OpenEnv Submission Validator${NC}\n"
101
+ printf "${BOLD}========================================${NC}\n"
102
+ log "Repo: $REPO_DIR"
103
+ log "Ping URL: $PING_URL"
104
+ printf "\n"
105
+
106
+ log "${BOLD}Step 1/3: Pinging HF Space${NC} ($PING_URL/reset) ..."
107
+
108
+ CURL_OUTPUT=$(portable_mktemp "validate-curl")
109
+ CLEANUP_FILES+=("$CURL_OUTPUT")
110
+ HTTP_CODE=$(curl -s -o "$CURL_OUTPUT" -w "%{http_code}" -X POST \
111
+ -H "Content-Type: application/json" -d '{}' \
112
+ "$PING_URL/reset" --max-time 30 2>"$CURL_OUTPUT" || printf "000")
113
+
114
+ if [ "$HTTP_CODE" = "200" ]; then
115
+ pass "HF Space is live and responds to /reset"
116
+ elif [ "$HTTP_CODE" = "000" ]; then
117
+ fail "HF Space not reachable (connection failed or timed out)"
118
+ hint "Check your network connection and that the Space is running."
119
+ hint "Try: curl -s -o /dev/null -w '%%{http_code}' -X POST $PING_URL/reset"
120
+ stop_at "Step 1"
121
+ else
122
+ fail "HF Space /reset returned HTTP $HTTP_CODE (expected 200)"
123
+ hint "Make sure your Space is running and the URL is correct."
124
+ hint "Try opening $PING_URL in your browser first."
125
+ stop_at "Step 1"
126
+ fi
127
+
128
+ log "${BOLD}Step 2/3: Running docker build${NC} ..."
129
+
130
+ if ! command -v docker &>/dev/null; then
131
+ fail "docker command not found"
132
+ hint "Install Docker: https://docs.docker.com/get-docker/"
133
+ stop_at "Step 2"
134
+ fi
135
+
136
+ if [ -f "$REPO_DIR/Dockerfile" ]; then
137
+ DOCKER_CONTEXT="$REPO_DIR"
138
+ elif [ -f "$REPO_DIR/server/Dockerfile" ]; then
139
+ DOCKER_CONTEXT="$REPO_DIR/server"
140
+ else
141
+ fail "No Dockerfile found in repo root or server/ directory"
142
+ stop_at "Step 2"
143
+ fi
144
+
145
+ log " Found Dockerfile in $DOCKER_CONTEXT"
146
+
147
+ BUILD_OK=false
148
+ BUILD_OUTPUT=$(run_with_timeout "$DOCKER_BUILD_TIMEOUT" docker build "$DOCKER_CONTEXT" 2>&1) && BUILD_OK=true
149
+
150
+ if [ "$BUILD_OK" = true ]; then
151
+ pass "Docker build succeeded"
152
+ else
153
+ fail "Docker build failed (timeout=${DOCKER_BUILD_TIMEOUT}s)"
154
+ printf "%s\n" "$BUILD_OUTPUT" | tail -20
155
+ stop_at "Step 2"
156
+ fi
157
+
158
+ log "${BOLD}Step 3/3: Running openenv validate${NC} ..."
159
+
160
+ if ! command -v openenv &>/dev/null; then
161
+ fail "openenv command not found"
162
+ hint "Install it: pip install openenv-core"
163
+ stop_at "Step 3"
164
+ fi
165
+
166
+ VALIDATE_OK=false
167
+ VALIDATE_OUTPUT=$(cd "$REPO_DIR" && openenv validate 2>&1) && VALIDATE_OK=true
168
+
169
+ if [ "$VALIDATE_OK" = true ]; then
170
+ pass "openenv validate passed"
171
+ [ -n "$VALIDATE_OUTPUT" ] && log " $VALIDATE_OUTPUT"
172
+ else
173
+ fail "openenv validate failed"
174
+ printf "%s\n" "$VALIDATE_OUTPUT"
175
+ stop_at "Step 3"
176
+ fi
177
+
178
+ printf "\n"
179
+ printf "${BOLD}========================================${NC}\n"
180
+ printf "${GREEN}${BOLD} All 3/3 checks passed!${NC}\n"
181
+ printf "${GREEN}${BOLD} Your submission is ready to submit.${NC}\n"
182
+ printf "${BOLD}========================================${NC}\n"
183
+ printf "\n"
184
+
185
+ exit 0
validate.py DELETED
@@ -1,302 +0,0 @@
1
- """
2
- validate.py
3
- -----------
4
- Pre-submission validation — 23 checks across all three tasks.
5
- Usage: python validate.py
6
- Exit 0 = all pass. Exit 1 = failures.
7
- """
8
-
9
- import json, sys
10
- from typing import Callable, List, Tuple
11
-
12
- PASS = "βœ…"; FAIL = "❌"
13
- results: List[Tuple[str, bool, str]] = []
14
-
15
- def check(name: str, fn: Callable) -> None:
16
- try:
17
- fn(); results.append((name, True, ""))
18
- print(f" {PASS} {name}")
19
- except Exception as e:
20
- results.append((name, False, str(e)))
21
- print(f" {FAIL} {name}\n {e}")
22
-
23
- # ── Checks ──────────────────────────────────────────────────────────────────
24
-
25
def check_imports():
    """All env, task, and data modules must import cleanly."""
    from env.schemas import (
        Observation, Action, Reward, StepResult, ResetResult, StateResult, ActionType,
    )
    from tasks.task1.environment import Task1Environment
    from tasks.task1.grader import Task1Grader
    from tasks.task2.environment import Task2Environment
    from tasks.task2.grader import Task2Grader
    from tasks.task3.environment import Task3Environment
    from tasks.task3.grader import Task3Grader
    from data.data_loader import load_contracts
31
-
32
def check_openenv_yaml():
    """openenv.yaml declares name, spaces, reward, and >=3 tasks (>=2 active)."""
    import yaml  # third-party; only needed for this check

    with open("openenv.yaml") as f:
        spec = yaml.safe_load(f)

    task_list = spec.get("tasks", [])
    assert "name" in spec and len(task_list) >= 3
    assert "observation_space" in spec and "action_space" in spec and "reward" in spec
    active = [t for t in task_list if t.get("status") == "active"]
    assert len(active) >= 2, f"Expected >=2 active tasks, got {len(active)}"
40
-
41
def check_pydantic_models():
    """Core schema models construct for every action type and edge reward values."""
    from env.schemas import Observation, Action, ActionType, Reward, StepResult, ResetResult

    obs = Observation(
        task_id="t", contract_name="C", contract_description="D", available_actions=[]
    )
    action_types = (
        ActionType.LIST_FUNCTIONS,
        ActionType.SUBMIT_PROPERTY,
        ActionType.GET_PROPERTY_SPECIFICATION,
        ActionType.SUBMIT_FUNCTION,
    )
    for action_type in action_types:
        Action(action_type=action_type)
    Reward(value=-1.5, reason="test")  # negative rewards must validate too
    StepResult(observation=obs, reward=Reward(value=0, reason=""), done=False)
49
-
50
def check_data_loading():
    """Dataset exposes >=3 entries per task; each task3 entry is fully specified."""
    from data.data_loader import (
        load_contracts,
        get_all_vulnerable_entries,
        get_all_property_entries,
        get_all_task3_entries,
    )

    contracts = load_contracts()
    assert len(get_all_vulnerable_entries(contracts)) >= 3
    assert len(get_all_property_entries(contracts)) >= 3

    entries = get_all_task3_entries(contracts)
    assert len(entries) >= 3, f"Need >=3 task3 entries, got {len(entries)}"
    for _, fn in entries:
        task3_meta = fn.get("task3", {})
        assert task3_meta.get("property_english"), f"{fn['name']} missing property_english"
        assert task3_meta.get("property_formal"), f"{fn['name']} missing property_formal"
62
-
63
def check_t1_env():
    """Task 1 env: reset yields the right task id; one browse step costs -0.05."""
    from tasks.task1.environment import Task1Environment
    from env.schemas import Action, ActionType

    env = Task1Environment()
    reset_result = env.reset(seed=42)
    assert reset_result.observation.task_id == "task1_vuln_detection"

    step_result = env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    assert step_result.reward.value == -0.05
    assert step_result.observation.step_count == 1
    assert env.state().target_function is not None


def check_t2_env():
    """Task 2 env: reset exposes the target and all six browse actions step."""
    from tasks.task2.environment import Task2Environment
    from env.schemas import Action, ActionType

    env = Task2Environment()
    reset_result = env.reset(seed=42)
    assert reset_result.observation.task_id == "task2_property_discovery"
    assert "target_function" in reset_result.observation.extra

    browse_actions = (
        ActionType.GET_FUNCTION_CODE,
        ActionType.GET_FUNCTION_NATSPEC,
        ActionType.GET_FILE_NATSPEC,
        ActionType.GET_SIGNATURE,
        ActionType.GET_RELATED_FUNCTIONS,
        ActionType.GET_SIMILAR_RULE,
    )
    for action_type in browse_actions:
        env.step(Action(action_type=action_type))
83
-
84
def check_t3_env():
    """Task 3 env: reset exposes the property text; every browse action costs reward."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    reset_result = env.reset(seed=42)
    assert reset_result.observation.task_id == "task3_rule_checker"
    assert "property_english" in reset_result.observation.extra
    assert len(reset_result.observation.extra["property_english"]) > 10, \
        "property_english too short"

    for action_type in (
        ActionType.LIST_FUNCTIONS,
        ActionType.GET_PROPERTY_SPECIFICATION,
        ActionType.GET_CALL_GRAPH,
        ActionType.GET_STATE_VARIABLE,
    ):
        step_result = env.step(Action(action_type=action_type))
        assert step_result.reward.value < 0, \
            f"{action_type.value} should have negative shaping reward"
97
-
98
def check_t3_action_costs():
    """Each browse action charges its documented shaping cost.

    A fresh environment is reset per action so earlier steps cannot trigger
    repeated-query penalties and skew the measured cost.

    Fix: the original built and reset an extra Task3Environment before the
    loop that was never used; the dead work is removed.
    """
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    expected_costs = {
        ActionType.GET_PROPERTY_SPECIFICATION: -0.03,
        ActionType.LIST_FUNCTIONS: -0.05,
        ActionType.GET_CALL_GRAPH: -0.08,
    }
    for action_type, expected in expected_costs.items():
        env = Task3Environment()
        env.reset(seed=42)
        step_result = env.step(Action(action_type=action_type))
        # Float compare with tolerance: shaping rewards may be computed.
        assert abs(step_result.reward.value - expected) < 0.001, \
            f"{action_type.value}: expected {expected}, got {step_result.reward.value}"
112
-
113
def check_t3_function_metadata():
    """GET_FUNCTION_METADATA returns visibility info at the standard -0.05 cost."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=43)
    step_result = env.step(
        Action(
            action_type=ActionType.GET_FUNCTION_METADATA,
            params={"function_name": "withdraw"},
        )
    )
    assert "Visibility" in step_result.observation.last_action_result
    assert step_result.reward.value == -0.05


def check_t3_submit_correct():
    """Submitting the true target function ends the episode with +5.0."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=42)
    target = env.state().target_function
    step_result = env.step(
        Action(action_type=ActionType.SUBMIT_FUNCTION, params={"function_name": target})
    )
    assert step_result.done
    assert step_result.reward.value == 5.0, \
        f"Expected reward=5.0, got {step_result.reward.value}"
131
-
132
def check_t3_submit_subfunction():
    """Submitting a callee of the target earns the partial credit of +1.5."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    # seed 45 deterministically selects target `bid`, whose subfunctions
    # include `getPrice`.
    env = Task3Environment()
    env.reset(seed=45)
    assert env.state().target_function == "bid"

    step_result = env.step(
        Action(action_type=ActionType.SUBMIT_FUNCTION, params={"function_name": "getPrice"})
    )
    assert step_result.done
    assert step_result.reward.value == 1.5, \
        f"Expected partial reward=1.5, got {step_result.reward.value}"


def check_t3_submit_wrong():
    """Submitting an unrelated function ends the episode with -1.5."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=42)
    step_result = env.step(
        Action(action_type=ActionType.SUBMIT_FUNCTION, params={"function_name": "constructor"})
    )
    assert step_result.done
    assert step_result.reward.value == -1.5
150
-
151
def check_t3_one_submit_only():
    """After a submission finishes the episode, further steps must raise."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=42)
    env.step(
        Action(action_type=ActionType.SUBMIT_FUNCTION, params={"function_name": "deposit"})
    )
    try:
        env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    except RuntimeError:
        pass  # expected: the episode is already done
    else:
        raise AssertionError("Should raise RuntimeError after done")


def check_t3_repeated_penalty():
    """Repeating the same query is penalised at -0.40."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=42)
    env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    repeat_result = env.step(Action(action_type=ActionType.LIST_FUNCTIONS))
    assert repeat_result.reward.value == -0.40
170
-
171
def check_t1_grader():
    """Task 1 rubric: 1.0 exact match, 0.5 vague description, 0.0 wrong function."""
    from tasks.task1.grader import Task1Grader

    grader = Task1Grader("withdraw", "Reentrancy vulnerability")
    assert grader.grade_submission("withdraw", "reentrancy") == 1.0
    assert grader.grade_submission("withdraw", "vague") == 0.5
    assert grader.grade_submission("deposit", "reentrancy") == 0.0


def check_t2_grader():
    """Task 2 grader accepts every gold property and grades deterministically."""
    from tasks.task2.grader import Task2Grader
    from data.data_loader import load_contracts, get_all_property_entries

    for _, fn in get_all_property_entries(load_contracts()):
        grader = Task2Grader(fn["name"], fn["property"])
        # NOTE(review): grade() is indexed with [0] here but compared whole
        # against 0.0 two lines below — presumably it returns (score, ...)
        # and also supports float equality; confirm Task2Grader.grade's
        # return type against its implementation.
        assert grader.grade(fn["property"])[0] >= 0.65
        assert grader.grade("") == 0.0
        score = grader.grade("test")
        assert score == grader.grade("test")  # deterministic
186
-
187
def check_t3_grader():
    """Task 3 grading: 1.0 target (case-insensitive), 0.3 subfunction, else 0.0."""
    from tasks.task3.grader import Task3Grader

    grader = Task3Grader("withdraw", ["deposit"], "some rule")
    assert grader.grade("withdraw") == 1.0
    assert grader.grade("WITHDRAW") == 1.0  # case-insensitive
    assert grader.grade("deposit") == 0.3
    assert grader.grade("constructor") == 0.0

    # (submission, expected score, expected reward) triples.
    for submission, want_score, want_reward in (
        ("withdraw", 1.0, 5.0),
        ("deposit", 0.3, 1.5),
        ("other", 0.0, -1.5),
    ):
        score, reward = grader.grade_and_reward(submission)
        assert score == want_score and reward == want_reward


def check_reward_shaping():
    """Browse actions must not share a single flat cost (non-binary shaping)."""
    from tasks.task3.environment import Task3Environment
    from env.schemas import Action, ActionType

    env = Task3Environment()
    env.reset(seed=1)
    distinct_rewards = {
        env.step(Action(action_type=action_type)).reward.value
        for action_type in (
            ActionType.LIST_FUNCTIONS,
            ActionType.GET_PROPERTY_SPECIFICATION,
            ActionType.GET_CALL_GRAPH,
        )
    }
    assert len(distinct_rewards) >= 2
207
-
208
def check_app_imports():
    """FastAPI app boots, /health responds, and exactly 3 tasks are active."""
    from app import app
    from fastapi.testclient import TestClient

    client = TestClient(app)
    health = client.get("/health")
    assert health.status_code == 200

    tasks = client.get("/tasks").json()["tasks"]
    active = [t for t in tasks if t["status"] == "active"]
    assert len(active) == 3, f"Expected 3 active tasks, got {len(active)}: {active}"


def check_t3_http_reset():
    """POST /reset for task 3 returns an observation carrying the property text."""
    from app import app
    from fastapi.testclient import TestClient

    client = TestClient(app)
    response = client.post("/reset", json={"task_id": "task3_rule_checker", "seed": 42})
    assert response.status_code == 200

    observation = response.json()["observation"]
    assert observation["task_id"] == "task3_rule_checker"
    assert "property_english" in observation["extra"]
226
-
227
def check_dockerfile():
    """Dockerfile exists, exposes port 7860, and defines a launch command.

    Fix: the original ``open("Dockerfile").read()`` never closed the file
    handle; ``Path.read_text`` opens and closes it in one call.
    """
    from pathlib import Path

    dockerfile = Path("Dockerfile")
    assert dockerfile.exists()
    content = dockerfile.read_text(encoding="utf-8")
    assert "7860" in content and ("uvicorn" in content or "CMD" in content)
232
-
233
def check_inference_script():
    """inference.py exists and contains the required config plus Task 3 code.

    Fix: the original ``open("inference.py").read()`` never closed the file
    handle; ``Path.read_text`` opens and closes it in one call.
    """
    from pathlib import Path

    script = Path("inference.py")
    assert script.exists()
    content = script.read_text(encoding="utf-8")
    assert "HF_TOKEN" in content and "API_BASE_URL" in content and "MODEL_NAME" in content
    assert "Task3Environment" in content or "run_task3" in content
    assert "submit_function" in content
240
-
241
def check_baseline_json():
    """If baseline_scores.json exists, each task's avg_grader_score is in [0, 1].

    The file is optional, so absence is not a failure.

    Fix: the original ``json.load(open(...))`` never closed the file handle;
    a ``with`` block closes it deterministically.
    """
    import os

    if not os.path.exists("baseline_scores.json"):
        return
    with open("baseline_scores.json", encoding="utf-8") as f:
        data = json.load(f)
    for task in data.get("tasks", []):
        assert 0.0 <= task["avg_grader_score"] <= 1.0
247
-
248
- # ── Runner ──────────────────────────────────────────────────────────────────
249
-
250
# Registry of every pre-submission check as (report label, callable),
# executed top to bottom by main(). Order matters only for the printed report.
ALL_CHECKS = [
    ("Python imports (T1+T2+T3)", check_imports),
    ("openenv.yaml: 3 tasks, ≥2 active", check_openenv_yaml),
    ("Pydantic models (all ActionTypes)", check_pydantic_models),
    ("Dataset: vuln+property+task3 entries", check_data_loading),
    ("T1 env: reset/step/state", check_t1_env),
    ("T2 env: reset + 6 browse actions", check_t2_env),
    ("T3 env: reset + browse actions", check_t3_env),
    ("T3 action costs (formalized -0.03)", check_t3_action_costs),
    ("T3 get_function_metadata", check_t3_function_metadata),
    ("T3 submit correct → +5.0", check_t3_submit_correct),
    ("T3 submit subfunction → +1.5", check_t3_submit_subfunction),
    ("T3 submit wrong → -1.5", check_t3_submit_wrong),
    ("T3 one submit per episode", check_t3_one_submit_only),
    ("T3 repeated query → -0.40", check_t3_repeated_penalty),
    ("T1 grader: 0/0.5/1.0 rubric", check_t1_grader),
    ("T2 grader: all 11 properties", check_t2_grader),
    ("T3 grader: 1.0/0.3/0.0 + case-ins.", check_t3_grader),
    ("Reward shaping non-binary (T3)", check_reward_shaping),
    ("FastAPI: 3 active tasks", check_app_imports),
    ("FastAPI: T3 reset endpoint", check_t3_http_reset),
    ("Dockerfile + port 7860", check_dockerfile),
    ("inference.py: T3 code present", check_inference_script),
    ("baseline_scores.json schema", check_baseline_json),
]
275
-
276
def main():
    """Run every registered check, print a summary, and exit 0 (pass) or 1 (fail)."""
    banner = "=" * 64
    print(banner)
    print("OpenEnv Pre-Submission Validation (Task 1 + 2 + 3)")
    print(banner)
    print()

    for name, fn in ALL_CHECKS:
        check(name, fn)

    passed = sum(1 for _, ok, _ in results if ok)
    total = len(results)
    failed = [(name, msg) for name, ok, msg in results if not ok]

    print()
    print(banner)
    print(f"Results: {passed}/{total} checks passed")

    if not failed:
        print("\n✅ ALL CHECKS PASSED — ready to submit!")
        sys.exit(0)

    print("\nFailed checks:")
    for name, msg in failed:
        print(f" {FAIL} {name}: {msg}")
    print("\n❌ VALIDATION FAILED")
    sys.exit(1)


if __name__ == "__main__":
    main()