arnavzz committed on
Commit
51133cf
·
1 Parent(s): 3faaaa0

feat: add pre-submission validation script (36/36 checks pass)

Browse files
Files changed (1) hide show
  1. validate_submission.py +180 -0
validate_submission.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pre-submission validation script for the OpenEnv Code Debug environment.
3
+ Checks all items on the hackathon pre-submission checklist.
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import sys
9
+ import subprocess
10
+ from pathlib import Path
11
+
12
+ import httpx
13
+
14
# Base URL that the remote endpoint checks below are issued against.
HF_SPACE_URL = "https://arnavk-openenv-code-debugger.hf.space"
# Directory containing this script; local file checks are resolved from here.
ROOT = Path(__file__).parent

# ANSI-colored status tags printed in front of each check line.
PASS = "\033[92m[PASS]\033[0m"
FAIL = "\033[91m[FAIL]\033[0m"
WARN = "\033[93m[WARN]\033[0m"

# Outcome of every check, in execution order.
results = []


def check(name, ok, detail=""):
    """Print one PASS/FAIL line for a check and record its outcome.

    Args:
        name: Human-readable label of the check.
        ok: Outcome of the check; any value is accepted and reduced to
            its truthiness.
        detail: Optional extra text appended after an em dash.

    Returns:
        The outcome as a strict ``bool``.
    """
    # Coerce so ``results`` only ever holds real booleans; a truthy
    # non-numeric value (e.g. a string) would otherwise break
    # ``sum(results)`` when the totals are computed.
    passed = bool(ok)
    status = PASS if passed else FAIL
    print(f"{status} {name}" + (f" — {detail}" if detail else ""))
    results.append(passed)
    return passed
27
+
28
+
29
# ------------------------------------------------------------------
# 1. Local file structure
# ------------------------------------------------------------------
print("\n=== File Structure ===")
env_pkg = ROOT / "code_debug_env"
required_files = [
    ("inference.py at root", ROOT / "inference.py"),
    ("Dockerfile at root", ROOT / "Dockerfile"),
    ("openenv.yaml exists", env_pkg / "openenv.yaml"),
    ("models.py exists", env_pkg / "models.py"),
    ("server/app.py exists", env_pkg / "server" / "app.py"),
    ("server/environment.py exists", env_pkg / "server" / "environment.py"),
    ("server/executor.py exists", env_pkg / "server" / "executor.py"),
]
for label, path in required_files:
    check(label, path.exists())

tasks_dir = env_pkg / "tasks"
task_files = list(tasks_dir.rglob("*.json"))
check("3+ task files", len(task_files) >= 3, f"{len(task_files)} found")

difficulties = set()
for tf in task_files:
    # A single unreadable or malformed task file must not abort the whole
    # validation run; record it as a failed check and keep going.
    try:
        t = json.loads(tf.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:  # ValueError covers JSON and decode errors
        check(f"Task file {tf.name} is valid JSON", False, str(e))
        continue
    if not isinstance(t, dict):
        check(f"Task file {tf.name} is a JSON object", False, type(t).__name__)
        continue
    difficulties.add(t.get("difficulty"))
check("All 3 difficulty levels present", {"easy", "medium", "hard"}.issubset(difficulties), str(difficulties))
50
+
51
# ------------------------------------------------------------------
# 2. inference.py content checks
# ------------------------------------------------------------------
print("\n=== inference.py Content ===")
# A missing or unreadable inference.py must not crash the validator: fall
# back to empty content so every check below is reported as failed and the
# remaining sections still run.
try:
    inf = (ROOT / "inference.py").read_text(encoding="utf-8")
except (OSError, UnicodeDecodeError) as e:
    print(f" could not read inference.py: {e}")
    inf = ""

# Plain substring checks: they confirm the marker text appears in the file,
# not that it is used correctly.
check("Uses OpenAI client", "from openai import OpenAI" in inf)
check("Reads API_BASE_URL from env", "API_BASE_URL" in inf)
check("Reads MODEL_NAME from env", "MODEL_NAME" in inf)
check("Reads HF_TOKEN from env", "HF_TOKEN" in inf)
check("[START] log line", "[START]" in inf)
check("[STEP] log line", "[STEP]" in inf)
check("[END] log line", "[END]" in inf)
63
+
64
# ------------------------------------------------------------------
# 3. HF Space liveness
# ------------------------------------------------------------------
print("\n=== HF Space Liveness ===")
try:
    health_resp = httpx.get(f"{HF_SPACE_URL}/health", timeout=30)
    health_code = health_resp.status_code
    check("HF Space returns HTTP 200", health_code == 200, f"status={health_code}")
    # Any failure to decode or inspect the body falls through to the
    # generic "reachable" failure below.
    health_body = health_resp.json()
    reported = health_body.get("status")
    check("Health response is healthy", reported == "healthy", str(health_body))
except Exception as exc:
    check("HF Space reachable", False, str(exc))
75
+
76
# ------------------------------------------------------------------
# 4. reset() responds
# ------------------------------------------------------------------
print("\n=== reset() / step() Endpoints ===")
try:
    reset_resp = httpx.post(f"{HF_SPACE_URL}/reset", json={}, timeout=30)
    check("POST /reset returns 200", reset_resp.status_code == 200)
    reset_body = reset_resp.json()
    # ``ep`` is reused by the step()/state() sections further down.
    ep = reset_body.get("episode_id")
    check("reset() returns episode_id", bool(ep))

    obs = reset_body.get("observation", {})
    for field in ("buggy_code", "test_descriptions", "difficulty"):
        check(f"observation has {field}", bool(obs.get(field)))
except Exception as exc:
    check("reset() works", False, str(exc))
    # Without a usable episode the dependent sections are skipped.
    ep = None
93
+
94
# ------------------------------------------------------------------
# 5. step() returns reward in 0.0–1.0
# ------------------------------------------------------------------
if ep:
    try:
        r2 = httpx.post(
            f"{HF_SPACE_URL}/step/{ep}",
            json={"action": {"code": "def placeholder(): pass"}},
            timeout=30,
        )
        check("POST /step returns 200", r2.status_code == 200)
        d = r2.json()
        reward = d.get("reward", -1)
        # Validate the type before comparing: a null or string reward would
        # raise TypeError in the range test and be misreported as a step()
        # failure, and a JSON boolean would silently pass as 0/1.
        reward_is_number = isinstance(reward, (int, float)) and not isinstance(reward, bool)
        check(
            "reward in [0.0, 1.0]",
            reward_is_number and 0.0 <= reward <= 1.0,
            f"reward={reward!r}",
        )
        check("done field is bool", isinstance(d.get("done"), bool))
    except Exception as e:
        check("step() works", False, str(e))
109
+
110
# ------------------------------------------------------------------
# 6. state() endpoint
# ------------------------------------------------------------------
if ep:
    try:
        state_resp = httpx.get(f"{HF_SPACE_URL}/state/{ep}", timeout=30)
        check("GET /state returns 200", state_resp.status_code == 200)
        state_body = state_resp.json()
        has_episode_id = bool(state_body.get("episode_id"))
        check("state has episode_id", has_episode_id)
        # Only the presence of the key is required; a step_count of 0 is fine.
        check("state has step_count", "step_count" in state_body)
    except Exception as exc:
        check("state() works", False, str(exc))
122
+
123
# ------------------------------------------------------------------
# 7. Tasks enumeration
# ------------------------------------------------------------------
print("\n=== Task Enumeration ===")
try:
    tasks_resp = httpx.get(f"{HF_SPACE_URL}/tasks", timeout=30)
    check("GET /tasks returns 200", tasks_resp.status_code == 200)
    tasks = tasks_resp.json()
    task_count = len(tasks)
    check("3+ tasks listed", task_count >= 3, f"{task_count} tasks")
    # Each listed task is indexed by "difficulty"; a missing key raises and
    # is reported through the generic handler below.
    task_difficulties = set()
    for task in tasks:
        task_difficulties.add(task["difficulty"])
    required_levels = {"easy", "medium", "hard"}
    check("All difficulties present in tasks endpoint", required_levels <= task_difficulties)
except Exception as exc:
    check("tasks endpoint works", False, str(exc))
136
+
137
# ------------------------------------------------------------------
# 8. inference.py log format check (dry run on one task)
# ------------------------------------------------------------------
print("\n=== inference.py Log Format ===")
hf_token = os.getenv("HF_TOKEN", "")
if not hf_token:
    # Surface the most likely cause of a failed run up front instead of
    # leaving it buried in the subprocess output.
    print(f"{WARN} HF_TOKEN is not set; inference.py will receive an empty token")

env = os.environ.copy()
env.update({
    "API_BASE_URL": "https://router.huggingface.co/v1",
    "MODEL_NAME": "Qwen/Qwen2.5-72B-Instruct",
    "HF_TOKEN": hf_token,
    "ENV_URL": HF_SPACE_URL,
})

try:
    proc = subprocess.run(
        [sys.executable, str(ROOT / "inference.py")],
        capture_output=True, text=True, timeout=300, env=env
    )
    # Split stdout once and reuse it for all three marker checks.
    log_lines = proc.stdout.splitlines()
    for tag in ("[START]", "[STEP]", "[END]"):
        check(f"{tag} line emitted", any(line.startswith(tag) for line in log_lines))
    check("inference.py exits cleanly", proc.returncode == 0, f"exit={proc.returncode}")
    if proc.returncode != 0 and proc.stderr:
        # Show the tail of stderr: in a Python traceback the exception type
        # and message come last, so the head would usually hide the cause.
        print(f" stderr: {proc.stderr[-300:]}")
except subprocess.TimeoutExpired:
    check("inference.py completes within 5 min", False, "timed out")
except Exception as e:
    check("inference.py runs", False, str(e))
168
+
169
# ------------------------------------------------------------------
# Summary
# ------------------------------------------------------------------
print("\n=== Summary ===")
# Count by truthiness so the totals hold even if a non-bool outcome was
# recorded.
passed = sum(1 for r in results if r)
total = len(results)
# Space-separated so individual outcomes stay readable in the one-line recap.
print(" ".join("PASS" if r else "FAIL" for r in results))
print(f"{passed}/{total} checks passed")
if passed == total:
    print("\n[READY] All checks passed - ready to submit!")
else:
    print(f"\n[ACTION NEEDED] Fix {total - passed} failing check(s) before submitting.")
    # Signal failure to the caller (shell, CI, pre-submit hook); without a
    # non-zero exit status a failed validation looks like a success.
    sys.exit(1)