Parthiban007 committed on
Commit
8a096e2
·
verified ·
1 Parent(s): 2154988

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. openenv.yaml +102 -22
  2. server/app.py +90 -0
openenv.yaml CHANGED
@@ -13,37 +13,117 @@ tags:
13
  - coding-benchmark
14
 
15
  # Task Definition (Easy -> Medium -> Hard)
 
16
  tasks:
17
- - id: 1
18
  title: "Broken CLI Argument Parser"
19
- difficulty: "Easy"
20
- - id: 2
21
- title: "Conflicting Borrows"
22
- difficulty: "Easy"
23
- - id: 3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  title: "Lifetime Annotations"
25
- difficulty: "Medium"
26
- - id: 4
27
- title: "Business Logic"
28
- difficulty: "Medium"
29
- - id: 5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  title: "Linked List Management"
31
- difficulty: "Medium"
32
- - id: 6
 
 
 
 
 
 
 
 
33
  title: "Multi-threaded Deadlocks"
34
- difficulty: "Hard"
35
- - id: 7
 
 
 
 
 
 
 
 
36
  title: "Async Borrowing"
37
- difficulty: "Hard"
38
- - id: 8
 
 
 
 
 
 
 
 
39
  title: "Unsafe FFI Integration"
40
- difficulty: "Hard"
41
- - id: 9
 
 
 
 
 
 
 
 
42
  title: "Inefficient Data Pipelines"
43
- difficulty: "Hard"
44
- - id: 10
 
 
 
 
 
 
 
 
45
  title: "Memory Leak Prevention"
46
- difficulty: "Hard+"
 
 
 
 
 
 
 
47
 
48
  # Definitions for Documentation and Graders
49
  action_space:
 
13
  - coding-benchmark
14
 
15
  # Task Definition (Easy -> Medium -> Hard)
16
+ # Each task has a grader that scores submissions 0.0-1.0
17
  tasks:
18
+ - id: "task_1"
19
  title: "Broken CLI Argument Parser"
20
+ difficulty: "easy"
21
+ description: "Fix enum variant mismatches and incomplete match arms in a CLI argument parser."
22
+ grader:
23
+ type: "programmatic"
24
+ endpoint: "/grade/task_1"
25
+ success_threshold: 0.7
26
+ reward_range: [0.0, 1.0]
27
+ description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
28
+
29
+ - id: "task_2"
30
+ title: "Conflicting Borrows in Collection Processing"
31
+ difficulty: "easy"
32
+ description: "Resolve mutable/immutable borrow conflicts in a string collection processor."
33
+ grader:
34
+ type: "programmatic"
35
+ endpoint: "/grade/task_2"
36
+ success_threshold: 0.7
37
+ reward_range: [0.0, 1.0]
38
+ description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
39
+
40
+ - id: "task_3"
41
  title: "Lifetime Annotations"
42
+ difficulty: "medium"
43
+ description: "Add correct lifetime annotations to enable a struct holding references to work properly."
44
+ grader:
45
+ type: "programmatic"
46
+ endpoint: "/grade/task_3"
47
+ success_threshold: 0.6
48
+ reward_range: [0.0, 1.0]
49
+ description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
50
+
51
+ - id: "task_4"
52
+ title: "Business Logic Bug"
53
+ difficulty: "medium"
54
+ description: "Fix off-by-one errors and logic bugs in a financial calculation module."
55
+ grader:
56
+ type: "programmatic"
57
+ endpoint: "/grade/task_4"
58
+ success_threshold: 0.6
59
+ reward_range: [0.0, 1.0]
60
+ description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
61
+
62
+ - id: "task_5"
63
  title: "Linked List Management"
64
+ difficulty: "medium"
65
+ description: "Implement a safe singly-linked list with push, pop, and peek operations."
66
+ grader:
67
+ type: "programmatic"
68
+ endpoint: "/grade/task_5"
69
+ success_threshold: 0.6
70
+ reward_range: [0.0, 1.0]
71
+ description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
72
+
73
+ - id: "task_6"
74
  title: "Multi-threaded Deadlocks"
75
+ difficulty: "hard"
76
+ description: "Identify and fix deadlock conditions in a multi-threaded producer-consumer pattern."
77
+ grader:
78
+ type: "programmatic"
79
+ endpoint: "/grade/task_6"
80
+ success_threshold: 0.5
81
+ reward_range: [0.0, 1.0]
82
+ description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
83
+
84
+ - id: "task_7"
85
  title: "Async Borrowing"
86
+ difficulty: "hard"
87
+ description: "Fix async/await borrowing conflicts in a concurrent file processor."
88
+ grader:
89
+ type: "programmatic"
90
+ endpoint: "/grade/task_7"
91
+ success_threshold: 0.5
92
+ reward_range: [0.0, 1.0]
93
+ description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
94
+
95
+ - id: "task_8"
96
  title: "Unsafe FFI Integration"
97
+ difficulty: "hard"
98
+ description: "Write safe Rust wrappers around unsafe FFI calls to a C library."
99
+ grader:
100
+ type: "programmatic"
101
+ endpoint: "/grade/task_8"
102
+ success_threshold: 0.5
103
+ reward_range: [0.0, 1.0]
104
+ description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
105
+
106
+ - id: "task_9"
107
  title: "Inefficient Data Pipelines"
108
+ difficulty: "hard"
109
+ description: "Optimize a data transformation pipeline using iterators and avoiding unnecessary allocations."
110
+ grader:
111
+ type: "programmatic"
112
+ endpoint: "/grade/task_9"
113
+ success_threshold: 0.5
114
+ reward_range: [0.0, 1.0]
115
+ description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
116
+
117
+ - id: "task_10"
118
  title: "Memory Leak Prevention"
119
+ difficulty: "hard"
120
+ description: "Fix memory leak patterns in a custom allocator and ensure proper Drop implementations."
121
+ grader:
122
+ type: "programmatic"
123
+ endpoint: "/grade/task_10"
124
+ success_threshold: 0.4
125
+ reward_range: [0.0, 1.0]
126
+ description: "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"
127
 
128
  # Definitions for Documentation and Graders
129
  action_space:
server/app.py CHANGED
@@ -9,12 +9,19 @@ Endpoints (provided by OpenEnv `create_app`):
9
  - GET /state
10
  - GET /schema
11
  - WS /ws
 
 
 
 
 
12
  """
13
 
14
  import os
15
  import logging
16
 
17
  from dotenv import load_dotenv
 
 
18
  from openenv.core.env_server.http_server import create_app
19
 
20
  from models import RustCoderAction, RustCoderObservation
@@ -37,11 +44,94 @@ app = create_app(
37
  )
38
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  @app.get("/health")
41
  async def health_check():
42
  return {"status": "healthy"}
43
 
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  def main(host: str = "0.0.0.0", port: int = 8000) -> None:
46
  import uvicorn
47
 
 
9
  - GET /state
10
  - GET /schema
11
  - WS /ws
12
+
13
+ Additional endpoints:
14
+ - GET /health
15
+ - GET /tasks — list all tasks with grader metadata
16
+ - POST /grade/{task_id} — grade a code submission for a specific task
17
  """
18
 
19
  import os
20
  import logging
21
 
22
  from dotenv import load_dotenv
23
+ from fastapi import HTTPException
24
+ from pydantic import BaseModel
25
  from openenv.core.env_server.http_server import create_app
26
 
27
  from models import RustCoderAction, RustCoderObservation
 
44
  )
45
 
46
 
47
+ # ---------------------------------------------------------------------------
48
+ # Task metadata — mirrors openenv.yaml tasks section
49
+ # ---------------------------------------------------------------------------
50
+
51
# Ordered task catalogue: (id, title, difficulty, success_threshold).
# A task's position in this tuple is its index, so the "index" field of the
# registry is derived with enumerate() rather than maintained by hand; adding
# or reordering tasks can no longer leave an index out of sync.
_TASK_SPECS = (
    ("task_1", "Broken CLI Argument Parser", "easy", 0.7),
    ("task_2", "Conflicting Borrows in Collection Processing", "easy", 0.7),
    ("task_3", "Lifetime Annotations", "medium", 0.6),
    ("task_4", "Business Logic Bug", "medium", 0.6),
    ("task_5", "Linked List Management", "medium", 0.6),
    ("task_6", "Multi-threaded Deadlocks", "hard", 0.5),
    ("task_7", "Async Borrowing", "hard", 0.5),
    ("task_8", "Unsafe FFI Integration", "hard", 0.5),
    ("task_9", "Inefficient Data Pipelines", "hard", 0.5),
    ("task_10", "Memory Leak Prevention", "hard", 0.4),
)

# One dict per task, in catalogue order, with keys:
# id, index, title, difficulty, success_threshold.
_TASK_REGISTRY = [
    {
        "id": task_id,
        "index": position,
        "title": title,
        "difficulty": difficulty,
        "success_threshold": threshold,
    }
    for position, (task_id, title, difficulty, threshold) in enumerate(_TASK_SPECS)
]

# Lookup table from task id (e.g. "task_3") to its registry entry.
_TASK_BY_ID = {task["id"]: task for task in _TASK_REGISTRY}
64
+
65
+
66
+ # ---------------------------------------------------------------------------
67
+ # Endpoints
68
+ # ---------------------------------------------------------------------------
69
+
70
  @app.get("/health")
71
  async def health_check():
72
  return {"status": "healthy"}
73
 
74
 
75
@app.get("/tasks")
async def list_tasks():
    """List every registered task together with its grader metadata.

    Returns:
        A dict with ``"tasks"`` (one entry per item of ``_TASK_REGISTRY``,
        in registry order) and ``"total"`` (the number of entries).
    """
    # Every task advertises the same scoring rubric.
    rubric = "Compilation(40%) + Correctness(20%) + Coverage(20%) + Elegance(10%) + Efficiency(10%)"

    catalogue = [
        {
            "id": entry["id"],
            "title": entry["title"],
            "difficulty": entry["difficulty"],
            "grader": {
                "type": "programmatic",
                "endpoint": f"/grade/{entry['id']}",
                "success_threshold": entry["success_threshold"],
                "reward_range": [0.0, 1.0],
                "description": rubric,
            },
        }
        for entry in _TASK_REGISTRY
    ]
    return {"tasks": catalogue, "total": len(catalogue)}
93
+
94
+
95
class GradeRequest(BaseModel):
    """JSON request body accepted by the grading endpoint."""

    # Source code submitted for grading. The field is optional: a body that
    # omits it is graded as an empty submission.
    code: str = ""
97
+
98
+
99
def _score_from_reward(reward) -> float:
    """Map a raw environment reward onto a score in [0.0, 1.0].

    ``None`` and NaN both count as 0.0. NaN needs the explicit check:
    ``min(1.0, nan)`` evaluates to 1.0, so plain clamping would turn a
    broken reward into a perfect score.
    """
    import math

    if reward is None:
        return 0.0
    value = float(reward)
    if math.isnan(value):
        return 0.0
    return max(0.0, min(1.0, value))


@app.post("/grade/{task_id}")
async def grade_task(task_id: str, request: GradeRequest):
    """
    Grade a Rust code submission for a specific task.

    Returns a score in [0.0, 1.0] with detailed breakdown.
    This is the programmatic grader endpoint referenced in openenv.yaml.

    Args:
        task_id: Registry id of the task to grade against (e.g. "task_1").
        request: Request body carrying the submitted code.

    Raises:
        HTTPException: 404 when ``task_id`` is not in the task registry.
    """
    task_meta = _TASK_BY_ID.get(task_id)
    if task_meta is None:
        raise HTTPException(status_code=404, detail=f"Task '{task_id}' not found.")

    # A fresh environment per request, reset to the requested task.
    # NOTE(review): reset() and step() run synchronously inside this
    # coroutine; if they block (presumably they compile and test the
    # submission - confirm), the event loop stalls for their duration.
    # Consider offloading to a worker thread once the environment is known
    # to be safe to run concurrently.
    env = RustCoderEnvironment()
    env.reset(start_index=task_meta["index"])

    # Submit the code as a single step and read the resulting observation.
    obs = env.step(RustCoderAction(code=request.code))

    threshold = task_meta["success_threshold"]
    score = _score_from_reward(obs.reward)

    return {
        "task_id": task_id,
        "score": round(score, 4),
        "success": score >= threshold,
        "success_threshold": threshold,
        "reward_breakdown": obs.reward_breakdown,
        "compilation_success": obs.compilation_success,
        "compilation_output": obs.compilation_output,
        "test_results": obs.test_results,
    }
133
+
134
+
135
  def main(host: str = "0.0.0.0", port: int = 8000) -> None:
136
  import uvicorn
137