shank committed on
Commit
ee08016
·
1 Parent(s): 9940e16

Cleaner code and logic improvement

Browse files
.gitignore ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python cache — never commit these
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.pyc
6
+ *.pyo
7
+
8
+ # Virtual environments
9
+ .venv/
10
+ venv/
11
+
12
+ # Additional virtual environment directories:
13
+ .venv/
14
+ venv/
15
+ .env/
16
+
17
+ # Environment variables — never commit secrets
18
+ .env
19
+ *.env
20
+ .env.local
21
+ .env.production
22
+
23
+ # Test and coverage output
24
+ .pytest_cache/
25
+ .coverage
26
+ htmlcov/
27
+ *.coverage
28
+
29
+ # Build artifacts
30
+ dist/
31
+ build/
32
+ *.egg-info/
33
+
34
+ # IDE files
35
+ .vscode/
36
+ .idea/
37
+ *.swp
38
+ *.swo
39
+ .DS_Store
40
+
41
+ # Baseline results (generated, not source)
42
+ baseline_results.json
43
+
44
+ # Temporary sandbox files (normally cleaned up at runtime; ignored here as a safeguard)
45
+ sandbox_*.py
46
+ /tmp/sandbox_*
env/__pycache__/__init__.cpython-313.pyc CHANGED
Binary files a/env/__pycache__/__init__.cpython-313.pyc and b/env/__pycache__/__init__.cpython-313.pyc differ
 
env/__pycache__/environment.cpython-313.pyc CHANGED
Binary files a/env/__pycache__/environment.cpython-313.pyc and b/env/__pycache__/environment.cpython-313.pyc differ
 
env/__pycache__/models.cpython-313.pyc CHANGED
Binary files a/env/__pycache__/models.cpython-313.pyc and b/env/__pycache__/models.cpython-313.pyc differ
 
env/environment.py CHANGED
@@ -340,10 +340,15 @@ class DebuggerEnvironment:
340
 
341
  # Run grader
342
  grader = get_grader(self._task_config["task_id"])
 
 
 
 
 
343
  grader_score = grader.score(
344
  task_config=self._task_config,
345
  attempts=self._all_attempts,
346
- best_tests_passed=self._best_tests_passed,
347
  tests_total=self._task_config["tests_total"],
348
  attempts_used=self._attempts_used,
349
  max_attempts=self._task_config["max_attempts"],
 
340
 
341
  # Run grader
342
  grader = get_grader(self._task_config["task_id"])
343
+ agent_best_tests_passed = (
344
+ max((a.get("tests_passed", 0) for a in self._all_attempts), default=0)
345
+ if self._all_attempts else 0
346
+ )
347
+
348
  grader_score = grader.score(
349
  task_config=self._task_config,
350
  attempts=self._all_attempts,
351
+ best_tests_passed=agent_best_tests_passed,
352
  tests_total=self._task_config["tests_total"],
353
  attempts_used=self._attempts_used,
354
  max_attempts=self._task_config["max_attempts"],
env/graders/grader_hard.py CHANGED
@@ -1,48 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
  Grader Hard β€” Concurrent stress test scoring.
3
- Custom weights:
4
- 0.40 — original 8 tests pass
5
- 0.30 — concurrent stress test (1000 threads)
6
- 0.20 — hypothesis accuracy
 
7
  0.10 — efficiency bonus (solved within 5 attempts)
 
 
 
 
 
 
 
8
  """
9
 
10
- import threading
11
  from typing import List, Dict, Any
12
  from env.graders.base_grader import BaseGrader
 
 
 
 
 
 
 
 
13
 
 
 
14
 
 
 
 
 
 
 
 
 
 
 
15
  class HardGrader(BaseGrader):
16
 
17
  def _run_concurrent_stress_test(self, code: str) -> bool:
18
  """
19
- Run a 1000-thread concurrent stress test against the submitted code.
20
- Returns True if the counter ends at exactly 1000 after 1000 concurrent increments.
 
 
21
  """
22
- try:
23
- # Execute the code in an isolated namespace
24
- namespace = {}
25
- exec(code, namespace)
26
-
27
- CounterClass = namespace.get("ConnectionCounter")
28
- if CounterClass is None:
29
- return False
30
-
31
- counter = CounterClass()
32
- num_threads = 1000
33
-
34
- threads = [
35
- threading.Thread(target=counter.increment)
36
- for _ in range(num_threads)
37
- ]
38
- for t in threads:
39
- t.start()
40
- for t in threads:
41
- t.join(timeout=10)
42
-
43
- return counter.get_count() == num_threads
44
- except Exception:
45
  return False
 
46
 
47
  def score(
48
  self,
@@ -57,32 +172,44 @@ class HardGrader(BaseGrader):
57
  ground_truth = task_config["ground_truth"]
58
  keywords = ground_truth["hypothesis_keywords"]
59
 
60
- # 1. Original tests pass (weight: 0.40)
61
- test_pass_ratio = (best_tests_passed / tests_total) if tests_total > 0 else 0.0
62
- original_test_score = test_pass_ratio * 0.40
 
 
 
 
 
 
 
 
 
 
 
63
 
64
- # 2. Concurrent stress test (weight: 0.30)
65
- # Use the best attempt's code (highest tests_passed, then latest)
 
 
66
  concurrent_score = 0.0
67
  if attempts:
68
- # Find the best attempt
69
  best_attempt = max(
70
  attempts,
71
  key=lambda a: (a.get("tests_passed", 0), a.get("attempt_number", 0))
72
  )
73
- best_code = best_attempt.get("code_submitted", "")
 
74
  if best_code:
75
- # Run the stress test 3 times — must pass all 3 for full credit
76
  passes = sum(
77
  1 for _ in range(3)
78
  if self._run_concurrent_stress_test(best_code)
79
  )
80
  if passes == 3:
81
- concurrent_score = 0.30
82
  elif passes >= 1:
83
- concurrent_score = 0.15 # Partial β€” inconsistent fix
84
 
85
- # 3. Hypothesis accuracy (weight: 0.20)
86
  if hypotheses:
87
  matches = sum(
88
  1 for h in hypotheses
@@ -93,8 +220,10 @@ class HardGrader(BaseGrader):
93
  hypothesis_ratio = 0.0
94
  hypothesis_score = hypothesis_ratio * 0.20
95
 
96
- # 4. Efficiency bonus (weight: 0.10)
97
- efficiency_score = 0.10 if attempts_used <= 5 else 0.0
 
 
98
 
99
- total = original_test_score + concurrent_score + hypothesis_score + efficiency_score
100
- return self._clamp(total)
 
1
+ # """
2
+ # Grader Hard β€” Concurrent stress test scoring.
3
+ # Custom weights:
4
+ # 0.40 β€” original 8 tests pass
5
+ # 0.30 β€” concurrent stress test (1000 threads)
6
+ # 0.20 β€” hypothesis accuracy
7
+ # 0.10 β€” efficiency bonus (solved within 5 attempts)
8
+ # """
9
+
10
+ # import threading
11
+ # from typing import List, Dict, Any
12
+ # from env.graders.base_grader import BaseGrader
13
+
14
+
15
+ # class HardGrader(BaseGrader):
16
+
17
+ # def _run_concurrent_stress_test(self, code: str) -> bool:
18
+ # """
19
+ # Run a 1000-thread concurrent stress test against the submitted code.
20
+ # Returns True if the counter ends at exactly 1000 after 1000 concurrent increments.
21
+ # """
22
+ # try:
23
+ # # Execute the code in an isolated namespace
24
+ # namespace = {}
25
+ # exec(code, namespace)
26
+
27
+ # CounterClass = namespace.get("ConnectionCounter")
28
+ # if CounterClass is None:
29
+ # return False
30
+
31
+ # counter = CounterClass()
32
+ # num_threads = 1000
33
+
34
+ # threads = [
35
+ # threading.Thread(target=counter.increment)
36
+ # for _ in range(num_threads)
37
+ # ]
38
+ # for t in threads:
39
+ # t.start()
40
+ # for t in threads:
41
+ # t.join(timeout=10)
42
+
43
+ # return counter.get_count() == num_threads
44
+ # except Exception:
45
+ # return False
46
+
47
+ # def score(
48
+ # self,
49
+ # task_config: dict,
50
+ # attempts: List[Dict[str, Any]],
51
+ # best_tests_passed: int,
52
+ # tests_total: int,
53
+ # attempts_used: int,
54
+ # max_attempts: int,
55
+ # hypotheses: List[str],
56
+ # ) -> float:
57
+ # ground_truth = task_config["ground_truth"]
58
+ # keywords = ground_truth["hypothesis_keywords"]
59
+
60
+ # # 1. Original tests pass (weight: 0.40)
61
+ # test_pass_ratio = (best_tests_passed / tests_total) if tests_total > 0 else 0.0
62
+ # original_test_score = test_pass_ratio * 0.40
63
+
64
+ # # 2. Concurrent stress test (weight: 0.30)
65
+ # # Use the best attempt's code (highest tests_passed, then latest)
66
+ # concurrent_score = 0.0
67
+ # if attempts:
68
+ # # Find the best attempt
69
+ # best_attempt = max(
70
+ # attempts,
71
+ # key=lambda a: (a.get("tests_passed", 0), a.get("attempt_number", 0))
72
+ # )
73
+ # best_code = best_attempt.get("code_submitted", "")
74
+ # if best_code:
75
+ # # Run the stress test 3 times β€” must pass all 3 for full credit
76
+ # passes = sum(
77
+ # 1 for _ in range(3)
78
+ # if self._run_concurrent_stress_test(best_code)
79
+ # )
80
+ # if passes == 3:
81
+ # concurrent_score = 0.30
82
+ # elif passes >= 1:
83
+ # concurrent_score = 0.15 # Partial β€” inconsistent fix
84
+
85
+ # # 3. Hypothesis accuracy (weight: 0.20)
86
+ # if hypotheses:
87
+ # matches = sum(
88
+ # 1 for h in hypotheses
89
+ # if self._check_hypothesis_keywords(h, keywords, "any")
90
+ # )
91
+ # hypothesis_ratio = matches / len(hypotheses)
92
+ # else:
93
+ # hypothesis_ratio = 0.0
94
+ # hypothesis_score = hypothesis_ratio * 0.20
95
+
96
+ # # 4. Efficiency bonus (weight: 0.10)
97
+ # efficiency_score = 0.10 if attempts_used <= 5 else 0.0
98
+
99
+ # total = original_test_score + concurrent_score + hypothesis_score + efficiency_score
100
+ # return self._clamp(total)
101
+
102
+
103
  """
104
  Grader Hard — Concurrent stress test scoring.
105
+
106
+ Weights:
107
+ 0.40 — agent's submitted fix passes the original 8 sequential tests
108
+ 0.30 — agent's submitted fix passes a 1000-thread concurrent stress test
109
+ 0.20 — hypothesis accuracy (agent correctly identified race condition)
110
  0.10 — efficiency bonus (solved within 5 attempts)
111
+
112
+ Security: ALL code execution goes through execute_code() sandbox.
113
+ Never uses raw exec() or eval() on agent-submitted code.
114
+
115
+ Score floor fix: original_test_score uses only agent-submitted attempts,
116
+ NOT the initial buggy code. An agent that submits nothing
117
+ scores 0.0, not 0.40.
118
  """
119
 
 
120
  from typing import List, Dict, Any
121
  from env.graders.base_grader import BaseGrader
122
+ from env.sandbox import execute_code
123
+
124
+
125
+ # The concurrent stress test — written as a string and run through the sandbox.
126
+ # 1000 threads all calling increment() simultaneously.
127
+ # A correct fix must result in count == 1000 every single time.
128
+ _CONCURRENT_STRESS_TEST = """
129
+ import threading
130
 
131
+ counter = ConnectionCounter()
132
+ num_threads = 1000
133
 
134
+ threads = [threading.Thread(target=counter.increment) for _ in range(num_threads)]
135
+ for t in threads:
136
+ t.start()
137
+ for t in threads:
138
+ t.join()
139
+
140
+ result = counter.get_count()
141
+ assert result == num_threads, f"CONCURRENT FAIL: expected {num_threads}, got {result}"
142
+ print(f"CONCURRENT PASS: {result} == {num_threads}")
143
+ """
144
  class HardGrader(BaseGrader):
145
 
146
  def _run_concurrent_stress_test(self, code: str) -> bool:
147
  """
148
+ Run the concurrent stress test against agent-submitted code.
149
+ Routes through execute_code() sandbox — never uses raw exec().
150
+ Returns True only if the counter reaches exactly 1000 after
151
+ 1000 concurrent increments.
152
  """
153
+ output, timed_out, _ = execute_code(
154
+ code,
155
+ _CONCURRENT_STRESS_TEST,
156
+ allow_threading=True,
157
+ )
158
+ if timed_out:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  return False
160
+ return "CONCURRENT PASS" in output and "CONCURRENT FAIL" not in output
161
 
162
  def score(
163
  self,
 
172
  ground_truth = task_config["ground_truth"]
173
  keywords = ground_truth["hypothesis_keywords"]
174
 
175
+ # ── 1. Sequential test score (weight: 0.40) ──────────────────────────
176
+ # IMPORTANT: Only count agent-submitted attempts, NOT the initial buggy
177
+ # code. The buggy code passes all 8 sequential tests — if we used
178
+ # best_tests_passed from environment state, every agent would score
179
+ # 0.40 for free without fixing anything. We recalculate from attempts.
180
+ if attempts:
181
+ agent_best_sequential = max(
182
+ a.get("tests_passed", 0) for a in attempts
183
+ )
184
+ else:
185
+ agent_best_sequential = 0 # No attempts submitted β†’ 0.0
186
+
187
+ sequential_ratio = agent_best_sequential / tests_total if tests_total > 0 else 0.0
188
+ sequential_score = sequential_ratio * 0.40
189
 
190
+ # ── 2. Concurrent stress test (weight: 0.30) ──────────────────────────
191
+ # Use the best attempt by sequential test count (ties broken by recency).
192
+ # Run the stress test 3 times — must pass all 3 for full credit,
193
+ # at least 1 for partial credit. This handles non-determinism fairly.
194
  concurrent_score = 0.0
195
  if attempts:
 
196
  best_attempt = max(
197
  attempts,
198
  key=lambda a: (a.get("tests_passed", 0), a.get("attempt_number", 0))
199
  )
200
+ best_code = best_attempt.get("code_submitted", "").strip()
201
+
202
  if best_code:
 
203
  passes = sum(
204
  1 for _ in range(3)
205
  if self._run_concurrent_stress_test(best_code)
206
  )
207
  if passes == 3:
208
+ concurrent_score = 0.30 # Fully correct fix
209
  elif passes >= 1:
210
+ concurrent_score = 0.15 # Partially correct β€” inconsistent
211
 
212
+ # ── 3. Hypothesis accuracy (weight: 0.20) ─────────────────────────────
213
  if hypotheses:
214
  matches = sum(
215
  1 for h in hypotheses
 
220
  hypothesis_ratio = 0.0
221
  hypothesis_score = hypothesis_ratio * 0.20
222
 
223
+ # ── 4. Efficiency bonus (weight: 0.10) ────────────────────────────────
224
+ # Only awarded if the agent actually fixed the concurrent bug too,
225
+ # not just for submitting fewer attempts on a wrong fix.
226
+ efficiency_score = 0.10 if (concurrent_score == 0.30 and attempts_used <= 5) else 0.0
227
 
228
+ total = sequential_score + concurrent_score + hypothesis_score + efficiency_score
229
+ return self._clamp(total)
openenv.yaml CHANGED
@@ -55,7 +55,7 @@ baseline:
55
  hard: 0.18
56
  author: shashaank
57
  license: MIT
58
- huggingface_space: shashaank/agentdebugger-env
59
  api_base_url_env_var: API_BASE_URL
60
  model_name_env_var: MODEL_NAME
61
  hf_token_env_var: HF_TOKEN
 
55
  hard: 0.18
56
  author: shashaank
57
  license: MIT
58
+ huggingface_space: shashaank0707/AgentDebugger-env
59
  api_base_url_env_var: API_BASE_URL
60
  model_name_env_var: MODEL_NAME
61
  hf_token_env_var: HF_TOKEN