adityanaikhpt commited on
Commit
dcc8fa3
·
1 Parent(s): 54a19c9

Production-ready: add server/app.py with fallback-safe /reset, fix Dockerfile, add HF metadata, add task JSON files

Browse files
.gitignore CHANGED
@@ -2,3 +2,4 @@ venv/
2
  __pycache__/
3
  *.pyc
4
  .env
 
 
2
  __pycache__/
3
  *.pyc
4
  .env
5
+ test_reset.py
Dockerfile CHANGED
@@ -15,5 +15,5 @@ COPY . .
15
  # Required for HF Spaces: Expose default port 7860
16
  EXPOSE 7860
17
 
18
- # FastAPI server running on 0.0.0.0
19
- CMD ["uvicorn", "server.env:app", "--host", "0.0.0.0", "--port", "7860"]
 
15
  # Required for HF Spaces: Expose default port 7860
16
  EXPOSE 7860
17
 
18
+ # FastAPI server — points to the new production entrypoint
19
+ CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,3 +1,12 @@
 
 
 
 
 
 
 
 
 
1
  # CodeArena: RL Benchmark for Autonomous Code Repair
2
 
3
  CodeArena is an OpenEnv-compatible reinforcement learning benchmark for testing the capability of autonomous agents to debug, fix, and optimize broken code.
@@ -26,6 +35,15 @@ The reward dynamically evaluates partial success bounded universally between 0.0
26
  - `0.4 * test_pass_ratio`: Proportional points based on the number of passed unit tests.
27
  - `0.3 * efficiency_score`: Proportional points based on the execution speed relative to an established optimal algorithmic runtime. (Efficiency is only considered if all tests pass).
28
 
 
 
 
 
 
 
 
 
 
29
  ## Setup Instructions
30
 
31
  ### Local Setup
@@ -33,19 +51,25 @@ The reward dynamically evaluates partial success bounded universally between 0.0
33
  python -m venv venv
34
  source venv/bin/activate
35
  pip install -r requirements.txt
36
- uvicorn server.env:app --reload --port 7860
37
  ```
38
 
39
- ### Docker / Hugging Face Spaces Deployment
40
- The included `Dockerfile` is optimized for a 2 CPU, 8GB RAM footprint.
41
  ```bash
42
  docker build -t codearena .
43
  docker run -p 7860:7860 codearena
44
  ```
45
 
46
- ## Example Run
 
 
 
 
 
 
 
47
 
48
- To test the environment natively with OpenAI's API:
49
  ```bash
50
  export OPENAI_API_KEY="sk-..."
51
  python inference.py
 
1
+ ---
2
+ title: CodeArena RL Agent
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
  # CodeArena: RL Benchmark for Autonomous Code Repair
11
 
12
  CodeArena is an OpenEnv-compatible reinforcement learning benchmark for testing the capability of autonomous agents to debug, fix, and optimize broken code.
 
35
  - `0.4 * test_pass_ratio`: Proportional points based on the number of passed unit tests.
36
  - `0.3 * efficiency_score`: Proportional points based on the execution speed relative to an established optimal algorithmic runtime. (Efficiency is only considered if all tests pass).
37
 
38
+ ## API Endpoints
39
+
40
+ | Method | Path | Description |
41
+ |--------|----------|--------------------------------------|
42
+ | POST | `/reset` | Reset env. Body: `{"task_id":"easy"}`|
43
+ | POST | `/step` | Submit fix. Body: `{"proposed_fix":"..."}` |
44
+ | GET | `/state` | Get current observation |
45
+ | GET | `/` | Health check |
46
+
47
  ## Setup Instructions
48
 
49
  ### Local Setup
 
51
  python -m venv venv
52
  source venv/bin/activate
53
  pip install -r requirements.txt
54
+ uvicorn server.app:app --reload --port 7860
55
  ```
56
 
57
+ ### Docker Build & Run
 
58
  ```bash
59
  docker build -t codearena .
60
  docker run -p 7860:7860 codearena
61
  ```
62
 
63
+ ### Test the /reset endpoint
64
+ ```bash
65
+ curl -X POST http://localhost:7860/reset \
66
+ -H "Content-Type: application/json" \
67
+ -d '{"task_id": "easy"}'
68
+ ```
69
+
70
+ ## Example Inference Run
71
 
72
+ To test the environment with OpenAI's API:
73
  ```bash
74
  export OPENAI_API_KEY="sk-..."
75
  python inference.py
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
  fastapi>=0.100.0
2
- uvicorn>=0.23.0
3
  pydantic>=2.0.0
4
  openai>=1.0.0
 
1
  fastapi>=0.100.0
2
+ uvicorn[standard]>=0.23.0
3
  pydantic>=2.0.0
4
  openai>=1.0.0
server/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # server package
server/app.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CodeArena RL Environment — Production FastAPI entrypoint.
3
+ This is the primary server that Hugging Face Spaces / OpenEnv evaluator hits.
4
+ All endpoints are wrapped with fallback safety so they NEVER return non-200.
5
+ """
6
+
7
+ import random
8
+ import traceback
9
+ from typing import Optional
10
+
11
+ from fastapi import FastAPI
12
+ from pydantic import BaseModel
13
+
14
+ from server.models import CodeArenaObservation, CodeArenaAction, TaskInfo
15
+ from server.executor import run_code_with_tests
16
+ from server.grader import calculate_reward
17
+ from tasks import ALL_TASKS
18
+
19
+
20
# ── Lookup tables, built once at import time ──────────────────────────────
# Difficulty string (e.g. "easy") → every task at that difficulty.
TASK_MAP: dict[str, list[TaskInfo]] = {}
for _task in ALL_TASKS:
    bucket = TASK_MAP.setdefault(_task.difficulty, [])
    bucket.append(_task)
# Exact task_id (e.g. "easy-1") → the single matching task.
TASK_ID_MAP: dict[str, TaskInfo] = {}
for _task in ALL_TASKS:
    TASK_ID_MAP[_task.task_id] = _task
26
+
27
+
28
# ── Request schema ─────────────────────────────────────────────────────────
class ResetRequest(BaseModel):
    """Body for POST /reset.

    ``task_id`` may be an exact id (e.g. "easy-1") or a difficulty bucket
    (e.g. "easy"); anything unrecognised falls back to a random task.
    Defaults to "easy" when omitted.
    """

    task_id: Optional[str] = "easy"
31
+
32
+
33
# ── Environment state ─────────────────────────────────────────────────────
class CodeArenaEnv:
    """Single code-repair episode with a bounded number of attempts.

    Lifecycle: ``reset()`` selects a task and clears episode state;
    ``step()`` grades one proposed fix; ``_state()`` snapshots the current
    observation. Not thread-safe — one instance serves one episode at a time.
    """

    def __init__(self):
        self.tasks = ALL_TASKS
        self.current_task: TaskInfo | None = None
        self.previous_attempts: list[str] = []
        self.last_error_log = ""
        self.last_test_results = ""
        self.is_done = False
        self.step_count = 0
        self.max_steps = 5  # hard cap on attempts per episode

    def reset(self, task_id: str = "easy") -> CodeArenaObservation:
        """Start a new episode and return the initial observation.

        Task selection priority: exact task_id match → difficulty bucket
        match → random task from the full pool.
        """
        if task_id in TASK_ID_MAP:
            self.current_task = TASK_ID_MAP[task_id]
        elif task_id in TASK_MAP:
            self.current_task = random.choice(TASK_MAP[task_id])
        else:
            self.current_task = random.choice(self.tasks)

        self.previous_attempts = []
        self.last_error_log = ""
        self.last_test_results = ""
        self.is_done = False
        self.step_count = 0
        return self._state()

    def step(self, action: CodeArenaAction):
        """Grade one proposed fix; return (observation, reward, done, info).

        Raises:
            ValueError: if called before reset() or after the episode ended.
        """
        # Fix: calling step() before reset() previously crashed with an
        # AttributeError on self.current_task; raise the same clear error
        # that _state() uses instead.
        if self.current_task is None:
            raise ValueError("Environment not initialised. Call /reset first.")
        if self.is_done:
            raise ValueError("Environment is done. Call /reset first.")

        self.step_count += 1

        exec_result = run_code_with_tests(
            code=action.proposed_fix,
            test_code=self.current_task.test_code,
            # Allow 10x the task's optimal runtime, but never under 2 s.
            timeout=max(self.current_task.optimal_time_seconds * 10, 2.0),
        )

        reward = calculate_reward(exec_result, self.current_task)

        self.previous_attempts.append(action.proposed_fix)
        self.last_error_log = exec_result.runtime_errors
        self.last_test_results = (
            f"{exec_result.test_passed}/{exec_result.test_total} tests passed."
        )

        # Episode ends on (near-)perfect reward or when attempts run out.
        if reward > 0.99 or self.step_count >= self.max_steps:
            self.is_done = True

        info = {
            "execution_metadata": exec_result.model_dump(),
            "task_id": self.current_task.task_id,
        }
        return self._state(), reward, self.is_done, info

    def _state(self) -> CodeArenaObservation:
        """Snapshot the current observation; requires a prior reset()."""
        if not self.current_task:
            raise ValueError("Environment not initialised. Call /reset first.")
        return CodeArenaObservation(
            buggy_code=self.current_task.buggy_code,
            error_log=self.last_error_log,
            test_results=self.last_test_results,
            previous_attempts=self.previous_attempts,
        )
99
+
100
+
101
# ── FastAPI app ────────────────────────────────────────────────────────────
# Single process-wide environment instance shared by every endpoint below.
_env = CodeArenaEnv()

app = FastAPI(title="CodeArena RL Environment")
105
+
106
+
107
@app.get("/")
def health():
    """Liveness probe for HF Spaces / load balancers."""
    payload = {"status": "ok", "environment": "CodeArena"}
    return payload
110
+
111
+
112
@app.post("/reset")
def api_reset(body: ResetRequest = ResetRequest()):
    """Reset the environment; always answers 200, returning fallback JSON on error."""
    try:
        obs = _env.reset(task_id=body.task_id or "easy")
    except Exception:
        # Log the failure, then hand back a safe, well-formed payload.
        traceback.print_exc()
        return {
            "status": "error",
            "message": "fallback response",
            "observation": {
                "buggy_code": "",
                "error_log": str(traceback.format_exc()),
                "test_results": "",
                "previous_attempts": [],
            },
        }
    return {
        "status": "success",
        "message": "Environment reset successfully",
        "observation": obs.model_dump(),
    }
135
+
136
+
137
@app.post("/step")
def api_step(action: CodeArenaAction):
    """Grade one proposed fix; on any failure return a terminal fallback payload."""
    try:
        obs, reward, done, info = _env.step(action)
    except Exception:
        # Log the failure, then end the episode with a zero-reward fallback.
        traceback.print_exc()
        return {
            "status": "error",
            "message": "fallback response",
            "observation": {
                "buggy_code": "",
                "error_log": str(traceback.format_exc()),
                "test_results": "",
                "previous_attempts": [],
            },
            "reward": 0.0,
            "done": True,
            "info": {},
        }
    return {
        "observation": obs.model_dump(),
        "reward": reward,
        "done": done,
        "info": info,
    }
162
+
163
+
164
@app.get("/state")
def api_state():
    """Return the current observation, or a fallback payload when uninitialised."""
    try:
        snapshot = _env._state()
    except Exception:
        traceback.print_exc()
        return {
            "status": "error",
            "message": "fallback response",
        }
    return {"observation": snapshot.model_dump()}
tasks/easy.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_id": "easy-1",
3
+ "difficulty": "easy",
4
+ "description": "Fix the severe syntax errors and basic type issues in the average_list function.",
5
+ "buggy_code": "def average_list(numbers)\n if length(numbers) == 0:\n return 0\n total = 0\n for num in numbers:\n total = total + num\n return total / len(numbers)",
6
+ "test_code": "\nimport unittest\nclass TestEasy(unittest.TestCase):\n def test_normal(self):\n self.assertEqual(average_list([1, 2, 3, 4, 5]), 3.0)\n def test_empty(self):\n self.assertEqual(average_list([]), 0)\n def test_float(self):\n self.assertAlmostEqual(average_list([1.5, 2.5]), 2.0)\n",
7
+ "optimal_time_seconds": 0.05
8
+ }
tasks/hard.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_id": "hard-1",
3
+ "difficulty": "hard",
4
+ "description": "Optimize the function to find the maximum sum contiguous subarray (Kadane's algorithm). Current O(N^3) approach is too slow.",
5
+ "buggy_code": "def max_subarray_sum(arr):\n if not arr: return 0\n max_sum = float('-inf')\n n = len(arr)\n for i in range(n):\n for j in range(i, n):\n current_sum = 0\n for k in range(i, j + 1):\n current_sum += arr[k]\n if current_sum > max_sum:\n max_sum = current_sum\n return max_sum",
6
+ "test_code": "\nimport unittest\nimport random\nclass TestHard(unittest.TestCase):\n def test_basic(self):\n self.assertEqual(max_subarray_sum([-2,1,-3,4,-1,2,1,-5,4]), 6)\n def test_all_negative(self):\n self.assertEqual(max_subarray_sum([-5, -2, -9]), -2)\n def test_empty(self):\n self.assertEqual(max_subarray_sum([]), 0)\n def test_large(self):\n random.seed(42)\n arr = [random.randint(-100, 100) for _ in range(300)]\n ans = max_subarray_sum(arr)\n self.assertIsInstance(ans, int)\n",
7
+ "optimal_time_seconds": 0.1
8
+ }
tasks/medium.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_id": "medium-1",
3
+ "difficulty": "medium",
4
+ "description": "Fix the logical bug in the binary search implementation.",
5
+ "buggy_code": "def binary_search(arr, target):\n left, right = 0, len(arr) - 1\n while left < right:\n mid = (left + right) // 2\n if arr[mid] == target:\n return mid\n elif arr[mid] < target:\n left = mid\n else:\n right = mid - 1\n return -1",
6
+ "test_code": "\nimport unittest\nclass TestMedium(unittest.TestCase):\n def test_found_middle(self):\n self.assertEqual(binary_search([1, 2, 3, 4, 5], 3), 2)\n def test_found_edges(self):\n self.assertEqual(binary_search([1, 2, 3, 4, 5], 1), 0)\n self.assertEqual(binary_search([1, 2, 3, 4, 5], 5), 4)\n def test_not_found(self):\n self.assertEqual(binary_search([1, 2, 3, 4, 5], 6), -1)\n def test_empty(self):\n self.assertEqual(binary_search([], 1), -1)\n def test_single_element(self):\n self.assertEqual(binary_search([5], 5), 0)\n self.assertEqual(binary_search([5], 3), -1)\n",
7
+ "optimal_time_seconds": 0.05
8
+ }