Parthiban007 committed on
Commit
e96c0d4
Β·
verified Β·
1 Parent(s): 83d47a9

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. README.md +8 -2
  2. client.py +1 -1
  3. inference.py +22 -11
  4. models.py +1 -1
  5. problems.json +10 -10
  6. server/app.py +14 -223
  7. server/rust_coder_environment.py +60 -22
README.md CHANGED
@@ -49,7 +49,7 @@ The environment returns detailed feedback after each submission:
49
  | Field | Type | Description |
50
  |------------------------|-------------|-----------------------------------------------------|
51
  | `problem_description` | string | Task requirements and context |
52
- | `starter_code` | string | The intentionally broken code to fix |
53
  | `compilation_success` | bool | Whether `rustc` compiled the submitted code |
54
  | `compilation_output` | string | Raw compiler errors and warnings |
55
  | `test_results` | list[dict] | Per-test pass/fail results with error details |
@@ -213,6 +213,12 @@ rust_coder/
213
  β”œβ”€β”€ inference.py # Baseline inference script (entry point)
214
  β”œβ”€β”€ __init__.py # Package exports
215
  └── server/
216
- β”œβ”€β”€ app.py # FastAPI + Gradio server
217
  └── rust_coder_environment.py # Core environment logic
218
  ```
 
 
 
 
 
 
 
49
  | Field | Type | Description |
50
  |------------------------|-------------|-----------------------------------------------------|
51
  | `problem_description` | string | Task requirements and context |
52
+ | `header_section` | string | LeetCode-style scaffold (imports + signatures/types) |
53
  | `compilation_success` | bool | Whether `rustc` compiled the submitted code |
54
  | `compilation_output` | string | Raw compiler errors and warnings |
55
  | `test_results` | list[dict] | Per-test pass/fail results with error details |
 
213
  β”œβ”€β”€ inference.py # Baseline inference script (entry point)
214
  β”œβ”€β”€ __init__.py # Package exports
215
  └── server/
216
+ β”œβ”€β”€ app.py # FastAPI OpenEnv server entrypoint
217
  └── rust_coder_environment.py # Core environment logic
218
  ```
219
+
220
+ ## HF Space runtime model
221
+
222
+ - The Hugging Face Space serves the environment via `uvicorn server.app:app` (see `openenv.yaml` and `Dockerfile`).
223
+ - The built-in OpenEnv web UI may send an empty action on Step; this environment supports that by auto-calling the LLM when `action.code` is empty (unless disabled via `AUTO_LLM_ON_EMPTY_STEP=0`).
224
+ - `inference.py` is the required baseline runner used by the validator/judge. It connects to the running Space and drives `reset()`/`step()` in a loop, emitting strict `[START]`/`[STEP]`/`[END]` stdout lines.
client.py CHANGED
@@ -44,7 +44,7 @@ class RustCoderEnv(
44
  obs_data = payload.get("observation", {})
45
  observation = RustCoderObservation(
46
  problem_description=obs_data.get("problem_description", ""),
47
- starter_code=obs_data.get("starter_code", ""),
48
  compilation_success=obs_data.get("compilation_success", False),
49
  compilation_output=obs_data.get("compilation_output", ""),
50
  test_results=obs_data.get("test_results", []),
 
44
  obs_data = payload.get("observation", {})
45
  observation = RustCoderObservation(
46
  problem_description=obs_data.get("problem_description", ""),
47
+ header_section=obs_data.get("header_section", ""),
48
  compilation_success=obs_data.get("compilation_success", False),
49
  compilation_output=obs_data.get("compilation_output", ""),
50
  test_results=obs_data.get("test_results", []),
inference.py CHANGED
@@ -34,17 +34,28 @@ from models import RustCoderAction
34
 
35
  # --- Strict Logging Helpers ---
36
  def log_start(task: str, env: str, model: str):
37
- print(f'[START] task="{task}" env="{env}" model="{model}"', flush=True)
 
38
 
39
  def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str] = None):
40
- escaped_action = action.replace('\n', '\\n')[:100] + "..."
41
- log_line = f'[STEP] step={step} action="{escaped_action}" reward={reward:.4f} done={str(done).lower()}'
42
- if error:
43
- log_line += f' error="{error}"'
44
- print(log_line, flush=True)
 
 
 
 
 
45
 
46
  def log_end(success: bool, steps: int, score: float, rewards: List[float]):
47
- print(f'[END] success={str(success).lower()} steps={steps} score={score:.4f} rewards={json.dumps(rewards)}', flush=True)
 
 
 
 
 
48
 
49
  # --- LLM Solution Logic ---
50
  async def get_model_code(prompt: str, client: OpenAI) -> str:
@@ -110,10 +121,10 @@ async def main():
110
 
111
  steps_taken = step
112
 
113
- # Format prompt including starter code if available
114
  prompt = obs.problem_description
115
- if obs.starter_code:
116
- prompt += f"\n\nStarter Code:\n```rust\n{obs.starter_code}\n```"
117
 
118
  # 1. Ask model for solution to current task
119
  code_solution = await get_model_code(prompt, client)
@@ -126,7 +137,7 @@ async def main():
126
  done = result.done
127
 
128
  rewards.append(reward)
129
- log_step(step=step, action=code_solution, reward=reward, done=done)
130
 
131
  if done:
132
  break
 
34
 
35
  # --- Strict Logging Helpers ---
36
  def log_start(task: str, env: str, model: str):
37
+ # REQUIRED exact stdout format (no quotes)
38
+ print(f"[START] task={task} env={env} model={model}", flush=True)
39
 
40
  def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str] = None):
41
+ # REQUIRED exact stdout format:
42
+ # [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
43
+ action_str = (action or "").replace("\r", "\\r").replace("\n", "\\n")
44
+ action_str = action_str[:200] # keep single-line + bounded
45
+ err_field = "null" if error is None else str(error).replace("\r", "\\r").replace("\n", "\\n")
46
+ reward_2 = f"{float(reward or 0.0):.2f}"
47
+ print(
48
+ f"[STEP] step={step} action={action_str} reward={reward_2} done={str(bool(done)).lower()} error={err_field}",
49
+ flush=True,
50
+ )
51
 
52
  def log_end(success: bool, steps: int, score: float, rewards: List[float]):
53
+ # REQUIRED exact stdout format, rewards as comma-separated 2dp
54
+ rewards_str = ",".join(f"{float(r or 0.0):.2f}" for r in rewards)
55
+ print(
56
+ f"[END] success={str(bool(success)).lower()} steps={steps} score={float(score or 0.0):.2f} rewards={rewards_str}",
57
+ flush=True,
58
+ )
59
 
60
  # --- LLM Solution Logic ---
61
  async def get_model_code(prompt: str, client: OpenAI) -> str:
 
121
 
122
  steps_taken = step
123
 
124
+ # Format prompt including header_section if available
125
  prompt = obs.problem_description
126
+ if getattr(obs, "header_section", ""):
127
+ prompt += f"\n\nHeader Section (must be included verbatim in final code):\n```rust\n{obs.header_section}\n```"
128
 
129
  # 1. Ask model for solution to current task
130
  code_solution = await get_model_code(prompt, client)
 
137
  done = result.done
138
 
139
  rewards.append(reward)
140
+ log_step(step=step, action=code_solution, reward=reward, done=done, error=None)
141
 
142
  if done:
143
  break
models.py CHANGED
@@ -24,7 +24,7 @@ class RustCoderObservation(Observation):
24
  """Observation space for the Rust Coder environment."""
25
 
26
  problem_description: str = Field(default="", description="The text description of the current coding task, including requirements.")
27
- starter_code: str = Field(default="", description="The specific Rust code snippet that needs fixing for this task.")
28
  compilation_success: bool = Field(default=False, description="Binary flag indicating if the last submission compiled.")
29
  compilation_output: str = Field(default="", description="Raw stdout/stderr from the rustc compiler.")
30
  test_results: list[dict] = Field(default_factory=list, description="A list of results from automated test assertions.")
 
24
  """Observation space for the Rust Coder environment."""
25
 
26
  problem_description: str = Field(default="", description="The text description of the current coding task, including requirements.")
27
+ header_section: str = Field(default="", description="LeetCode-style header/scaffold (imports + signatures/types) for deterministic evaluation.")
28
  compilation_success: bool = Field(default=False, description="Binary flag indicating if the last submission compiled.")
29
  compilation_output: str = Field(default="", description="Raw stdout/stderr from the rustc compiler.")
30
  test_results: list[dict] = Field(default_factory=list, description="A list of results from automated test assertions.")
problems.json CHANGED
@@ -4,7 +4,7 @@
4
  "title": "Broken CLI Argument Parser",
5
  "difficulty": "Easy",
6
  "description": "Fix a command-line tool that parses user input to determine file operations (read, write, append). The implementation uses enums and pattern matching but contains: Mismatched types in enum variants, Incomplete match arms, Incorrect handling of optional arguments. The parser must compile and correctly interpret valid command-line inputs like: 'read file.txt' -> FileOp::Read('file.txt'), 'write file.txt content' -> FileOp::Write('file.txt', Some('content')), 'append file.txt' -> FileOp::Append('file.txt')",
7
- "starter_code": "#[derive(Debug, PartialEq)]\nenum FileOp {\n Read(String),\n Write(String, Option<String>),\n Append(String),\n}\n\nfn parse_command(input: &str) -> Option<FileOp> {\n let parts: Vec<&str> = input.split_whitespace().collect();\n \n match parts.get(0) {\n Some(&\"read\") => {\n let filename = parts.get(1)?;\n FileOp::Read(filename.to_string()) // BUG: Missing Some()\n }\n Some(&\"write\") => {\n let filename = parts.get(1)?;\n let content = parts.get(2).map(|s| s.to_string());\n Some(FileOp::Write(filename.to_string(), content))\n }\n Some(&\"append\") => {\n let filename = parts.get(1)?;\n // BUG: Missing return statement\n }\n _ => None,\n }\n}\n\nfn main() {\n println!(\"CLI Parser Test\");\n}",
8
  "tests": [
9
  {
10
  "name": "parse_read_command",
@@ -35,7 +35,7 @@
35
  "title": "Conflicting Borrows in Collection Processing",
36
  "difficulty": "Easy\u2192Medium",
37
  "description": "Fix a function that processes a vector of strings while conditionally modifying elements and storing references for later use. The implementation mixes mutable and immutable borrows within the same scope, causing borrow checker conflicts. Requirements: Iterate through vector of strings, Store uppercase versions in a results vector, Handle optional transformations without borrowing conflicts, Must compile and execute without panics",
38
- "starter_code": "fn process_strings(strings: &mut Vec<String>) -> Vec<String> {\n let mut results = Vec::new();\n \n for s in strings {\n // BUG: Cannot borrow as mutable while immutable borrow is active\n let upper = s.to_uppercase();\n s.push_str(\"_processed\"); // Mutable borrow\n results.push(upper);\n }\n \n results\n}\n\nfn main() {\n println!(\"String processing\");\n}",
39
  "tests": [
40
  {
41
  "name": "process_single_string",
@@ -57,7 +57,7 @@
57
  "title": "Invalid Lifetime Annotations in Text API",
58
  "difficulty": "Medium",
59
  "description": "Fix a text-processing utility that accepts multiple string slices and returns a reference derived from them. The function either fails to compile or produces incorrect lifetime relationships, risking references that outlive their input data. Requirements: Function must accept multiple &str parameters, Return a &str derived from the inputs, Properly annotate lifetimes, Must be safe (no dangling references)",
60
- "starter_code": "// BUG: Invalid lifetime annotations - which lifetime should the return type use?\nfn longest_text<'a>(s1: &'a str, s2: &'a str) -> &'a str {\n if s1.len() > s2.len() {\n s1\n } else {\n s2\n }\n}\n\n// BUG: This function has a lifetime issue\nfn find_first_word(s: &str) -> &str {\n let bytes = s.as_bytes();\n for (i, &byte) in bytes.iter().enumerate() {\n if byte == b' ' {\n return &s[0..i];\n }\n }\n &s[..]\n}\n\nfn main() {\n println!(\"Lifetime test\");\n}",
61
  "tests": [
62
  {
63
  "name": "longest_text_basic",
@@ -88,7 +88,7 @@
88
  "title": "Business Logic Producing Incorrect Results",
89
  "difficulty": "Medium",
90
  "description": "Fix a module implementing order validation logic including pricing, discounts, and boundary conditions. The code compiles but produces incorrect outputs for edge cases such as: Zero values, Overlapping discounts, Large numeric inputs, Negative prices. Requirements: Calculate order total correctly, Apply discounts properly (no double-counting), Handle edge cases (zero items, negative values), Be mathematically sound",
91
- "starter_code": "#[derive(Debug, Clone)]\nstruct Order {\n quantity: i32,\n unit_price: f64,\n discount_percent: f64,\n}\n\nimpl Order {\n fn new(quantity: i32, unit_price: f64) -> Self {\n Order {\n quantity,\n unit_price,\n discount_percent: 0.0,\n }\n }\n\n fn with_discount(mut self, discount: f64) -> Self {\n self.discount_percent = discount;\n self\n }\n\n fn calculate_total(&self) -> f64 {\n let subtotal = self.quantity as f64 * self.unit_price;\n // BUG: Incorrect discount calculation\n let discount = subtotal * (self.discount_percent / 100.0);\n subtotal - discount // Missing rounding/validation\n }\n}\n\nfn main() {\n println!(\"Order test\");\n}",
92
  "tests": [
93
  {
94
  "name": "simple_order",
@@ -119,7 +119,7 @@
119
  "title": "Corrupted Singly Linked List",
120
  "difficulty": "Medium\u2192Hard",
121
  "description": "Fix a custom singly linked list that supports insertion, deletion, and traversal. The implementation incorrectly manages node ownership and pointer transitions, resulting in: Lost nodes, Inconsistent traversal output, Occasional runtime panics. Requirements: Insert elements at head, Delete elements correctly, Traverse without panics, No memory leaks or lost data",
122
- "starter_code": "use std::ptr;\n\n#[derive(Debug)]\nstruct Node<T> {\n value: T,\n next: Option<Box<Node<T>>>,\n}\n\n#[derive(Debug)]\nstruct LinkedList<T> {\n head: Option<Box<Node<T>>>,\n}\n\nimpl<T> LinkedList<T> {\n fn new() -> Self {\n LinkedList { head: None }\n }\n\n fn insert(&mut self, value: T) {\n let new_node = Box::new(Node {\n value,\n next: None, // BUG: Should move self.head into next\n });\n self.head = Some(new_node);\n }\n\n fn len(&self) -> usize {\n let mut count = 0;\n let mut current = &self.head;\n while let Some(node) = current {\n count += 1;\n current = &node.next; // Correct, but insert is broken\n }\n count\n }\n}\n\nfn main() {\n println!(\"LinkedList test\");\n}",
123
  "tests": [
124
  {
125
  "name": "insert_single_element",
@@ -143,7 +143,7 @@
143
  "title": "Deadlock in Multi-threaded Worker System",
144
  "difficulty": "Hard",
145
  "description": "Fix a worker system using multiple threads to process jobs from a shared queue protected by synchronization primitives. Under certain workloads, threads block indefinitely due to: Improper lock acquisition order, Shared state handling issues, Missing signal/wake mechanisms. Requirements: Spawn N worker threads, Process jobs from shared queue without deadlock, Handle shutdown gracefully, No panics under load",
146
- "starter_code": "use std::sync::{Arc, Mutex, mpsc};\nuse std::thread;\n\nfn worker_system(num_workers: usize, jobs: Vec<i32>) -> Vec<i32> {\n let (tx, rx) = mpsc::channel();\n let rx = Arc::new(Mutex::new(rx));\n let results = Arc::new(Mutex::new(Vec::new()));\n \n let mut handles = vec![];\n \n for _ in 0..num_workers {\n let rx = Arc::clone(&rx);\n let results = Arc::clone(&results);\n \n let handle = thread::spawn(move || {\n loop {\n // BUG: Lock acquired but never released before trying to acquire results lock\n let receiver = rx.lock().unwrap();\n match receiver.try_recv() {\n Ok(job) => {\n let result = job * 2;\n // BUG: Tries to lock results while still holding rx lock - DEADLOCK\n results.lock().unwrap().push(result);\n }\n Err(_) => break,\n }\n }\n });\n handles.push(handle);\n }\n \n for job in jobs {\n let _ = tx.send(job); // Ignore send errors\n }\n drop(tx);\n \n for handle in handles {\n let _ = handle.join();\n }\n \n Arc::try_unwrap(results)\n .unwrap()\n .into_inner()\n .unwrap()\n}\n\nfn main() {\n println!(\"Worker system test\");\n}",
147
  "tests": [
148
  {
149
  "name": "single_worker_single_job",
@@ -167,7 +167,7 @@
167
  "title": "Async Function with Borrowing Conflicts",
168
  "difficulty": "Hard",
169
  "description": "Fix an asynchronous function that processes input data and performs non-blocking operations while returning references tied to the input. The implementation violates borrowing constraints in an async context, leading to: Compilation errors when using references across await points, Invalid reference usage. Requirements: Accept &str input, Perform async operation, Return derived reference, Must be sound and compile",
170
- "starter_code": "use std::pin::Pin;\nuse std::future::Future;\n\n// BUG: Cannot return reference that outlives await point\nasync fn process_async(input: &str) -> &str {\n // Simulating async work\n // tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;\n \n // BUG: input reference cannot be returned from async context like this\n input\n}\n\n// Better approach: return owned data or 'static reference\nfn process_sync(input: &str) -> String {\n input.to_uppercase()\n}\n\nfn main() {\n println!(\"Async test\");\n}",
171
  "tests": [
172
  {
173
  "name": "process_sync_basic",
@@ -191,7 +191,7 @@
191
  "title": "Unsafe FFI Integration Causing Crashes",
192
  "difficulty": "Hard",
193
  "description": "Fix Rust code that interfaces with an external C library using raw pointers. The implementation incorrectly handles: Pointer ownership, Memory allocation and deallocation, Undefined behavior risks. Requirements: Safely wrap C library calls, Properly manage memory (allocate/deallocate), No undefined behavior, Handle errors gracefully",
194
- "starter_code": "extern \"C\" {\n fn malloc(size: usize) -> *mut u8;\n fn free(ptr: *mut u8);\n}\n\nfn allocate_and_init(size: usize) -> Vec<u8> {\n unsafe {\n let ptr = malloc(size);\n // BUG: No null check - ptr could be null\n // BUG: Memory not initialized before use\n let slice = std::slice::from_raw_parts_mut(ptr, size);\n \n // Copy to vec and free\n let vec = slice.to_vec();\n free(ptr); // BUG: Freeing memory still referenced in vec\n vec\n }\n}\n\nfn main() {\n println!(\"FFI test\");\n}",
195
  "tests": [
196
  {
197
  "name": "allocate_small_buffer",
@@ -208,7 +208,7 @@
208
  "title": "Inefficient Data Processing Pipeline",
209
  "difficulty": "Hard",
210
  "description": "Fix a data pipeline that reads large datasets, applies transformations, and aggregates results. While functionally correct, the implementation has: Excessive memory allocations, Redundant iterations, Inefficient data copying. Requirements: Process data efficiently, Minimize allocations and copies, Use iterators when possible, Produce correct results with better performance",
211
- "starter_code": "fn process_data(numbers: Vec<i32>) -> i32 {\n // BUG: Multiple unnecessary allocations and iterations\n \n // First pass: filter evens (allocates new vector)\n let evens: Vec<i32> = numbers.iter()\n .filter(|n| n % 2 == 0)\n .copied()\n .collect();\n \n // Second pass: double values (allocates another vector)\n let doubled: Vec<i32> = evens.iter()\n .map(|n| n * 2)\n .collect();\n \n // Third pass: sum (unnecessary iteration)\n let sum: i32 = doubled.iter().sum();\n \n // Fourth pass: filter again (redundant)\n let final_sum: i32 = doubled.iter()\n .filter(|n| n % 4 == 0)\n .sum();\n \n final_sum\n}\n\nfn main() {\n println!(\"Efficiency test\");\n}",
212
  "tests": [
213
  {
214
  "name": "simple_pipeline",
@@ -232,7 +232,7 @@
232
  "title": "Reference-counted Cache with Memory Leak",
233
  "difficulty": "Hard+",
234
  "description": "Fix a caching system using reference-counted pointers to share data across components. The design creates cyclic references between cached objects, preventing memory from being released and causing memory usage to grow over time. Requirements: Implement caching without memory leaks, Break circular reference patterns, Use Rc/Arc correctly with Weak pointers when needed, Memory should be released when cache is cleared",
235
- "starter_code": "use std::rc::Rc;\nuse std::cell::RefCell;\n\n#[derive(Debug)]\nstruct CacheNode<T> {\n key: String,\n value: T,\n // BUG: This creates a cycle that prevents garbage collection\n related: RefCell<Option<Rc<CacheNode<T>>>>,\n}\n\n#[derive(Debug)]\nstruct Cache<T> {\n items: RefCell<Vec<Rc<CacheNode<T>>>>,\n}\n\nimpl<T: Clone> Cache<T> {\n fn new() -> Self {\n Cache {\n items: RefCell::new(Vec::new()),\n }\n }\n\n fn insert(&self, key: String, value: T) {\n let node = Rc::new(CacheNode {\n key,\n value,\n related: RefCell::new(None),\n });\n \n // BUG: Creating cyclic references\n if let Some(last) = self.items.borrow().last() {\n // Rc to Rc creates a cycle\n if let Ok(mut r) = last.related.try_borrow_mut() {\n *r = Some(Rc::clone(&node)); // Cycle here!\n }\n }\n \n self.items.borrow_mut().push(node);\n }\n}\n\nfn main() {\n println!(\"Cache test\");\n}",
236
  "tests": [
237
  {
238
  "name": "cache_insert_single",
 
4
  "title": "Broken CLI Argument Parser",
5
  "difficulty": "Easy",
6
  "description": "Fix a command-line tool that parses user input to determine file operations (read, write, append). The implementation uses enums and pattern matching but contains: Mismatched types in enum variants, Incomplete match arms, Incorrect handling of optional arguments. The parser must compile and correctly interpret valid command-line inputs like: 'read file.txt' -> FileOp::Read('file.txt'), 'write file.txt content' -> FileOp::Write('file.txt', Some('content')), 'append file.txt' -> FileOp::Append('file.txt')",
7
+ "header_section": "#[derive(Debug, PartialEq)]\nenum FileOp {\n Read(String),\n Write(String, Option<String>),\n Append(String),\n}\n\nfn parse_command(input: &str) -> Option<FileOp> {\n let parts: Vec<&str> = input.split_whitespace().collect();\n \n match parts.get(0) {\n Some(&\"read\") => {\n let filename = parts.get(1)?;\n FileOp::Read(filename.to_string()) // BUG: Missing Some()\n }\n Some(&\"write\") => {\n let filename = parts.get(1)?;\n let content = parts.get(2).map(|s| s.to_string());\n Some(FileOp::Write(filename.to_string(), content))\n }\n Some(&\"append\") => {\n let filename = parts.get(1)?;\n // BUG: Missing return statement\n }\n _ => None,\n }\n}\n\nfn main() {\n println!(\"CLI Parser Test\");\n}",
8
  "tests": [
9
  {
10
  "name": "parse_read_command",
 
35
  "title": "Conflicting Borrows in Collection Processing",
36
  "difficulty": "Easy\u2192Medium",
37
  "description": "Fix a function that processes a vector of strings while conditionally modifying elements and storing references for later use. The implementation mixes mutable and immutable borrows within the same scope, causing borrow checker conflicts. Requirements: Iterate through vector of strings, Store uppercase versions in a results vector, Handle optional transformations without borrowing conflicts, Must compile and execute without panics",
38
+ "header_section": "fn process_strings(strings: &mut Vec<String>) -> Vec<String> {\n let mut results = Vec::new();\n \n for s in strings {\n // BUG: Cannot borrow as mutable while immutable borrow is active\n let upper = s.to_uppercase();\n s.push_str(\"_processed\"); // Mutable borrow\n results.push(upper);\n }\n \n results\n}\n\nfn main() {\n println!(\"String processing\");\n}",
39
  "tests": [
40
  {
41
  "name": "process_single_string",
 
57
  "title": "Invalid Lifetime Annotations in Text API",
58
  "difficulty": "Medium",
59
  "description": "Fix a text-processing utility that accepts multiple string slices and returns a reference derived from them. The function either fails to compile or produces incorrect lifetime relationships, risking references that outlive their input data. Requirements: Function must accept multiple &str parameters, Return a &str derived from the inputs, Properly annotate lifetimes, Must be safe (no dangling references)",
60
+ "header_section": "// BUG: Invalid lifetime annotations - which lifetime should the return type use?\nfn longest_text<'a>(s1: &'a str, s2: &'a str) -> &'a str {\n if s1.len() > s2.len() {\n s1\n } else {\n s2\n }\n}\n\n// BUG: This function has a lifetime issue\nfn find_first_word(s: &str) -> &str {\n let bytes = s.as_bytes();\n for (i, &byte) in bytes.iter().enumerate() {\n if byte == b' ' {\n return &s[0..i];\n }\n }\n &s[..]\n}\n\nfn main() {\n println!(\"Lifetime test\");\n}",
61
  "tests": [
62
  {
63
  "name": "longest_text_basic",
 
88
  "title": "Business Logic Producing Incorrect Results",
89
  "difficulty": "Medium",
90
  "description": "Fix a module implementing order validation logic including pricing, discounts, and boundary conditions. The code compiles but produces incorrect outputs for edge cases such as: Zero values, Overlapping discounts, Large numeric inputs, Negative prices. Requirements: Calculate order total correctly, Apply discounts properly (no double-counting), Handle edge cases (zero items, negative values), Be mathematically sound",
91
+ "header_section": "#[derive(Debug, Clone)]\nstruct Order {\n quantity: i32,\n unit_price: f64,\n discount_percent: f64,\n}\n\nimpl Order {\n fn new(quantity: i32, unit_price: f64) -> Self {\n Order {\n quantity,\n unit_price,\n discount_percent: 0.0,\n }\n }\n\n fn with_discount(mut self, discount: f64) -> Self {\n self.discount_percent = discount;\n self\n }\n\n fn calculate_total(&self) -> f64 {\n let subtotal = self.quantity as f64 * self.unit_price;\n // BUG: Incorrect discount calculation\n let discount = subtotal * (self.discount_percent / 100.0);\n subtotal - discount // Missing rounding/validation\n }\n}\n\nfn main() {\n println!(\"Order test\");\n}",
92
  "tests": [
93
  {
94
  "name": "simple_order",
 
119
  "title": "Corrupted Singly Linked List",
120
  "difficulty": "Medium\u2192Hard",
121
  "description": "Fix a custom singly linked list that supports insertion, deletion, and traversal. The implementation incorrectly manages node ownership and pointer transitions, resulting in: Lost nodes, Inconsistent traversal output, Occasional runtime panics. Requirements: Insert elements at head, Delete elements correctly, Traverse without panics, No memory leaks or lost data",
122
+ "header_section": "use std::ptr;\n\n#[derive(Debug)]\nstruct Node<T> {\n value: T,\n next: Option<Box<Node<T>>>,\n}\n\n#[derive(Debug)]\nstruct LinkedList<T> {\n head: Option<Box<Node<T>>>,\n}\n\nimpl<T> LinkedList<T> {\n fn new() -> Self {\n LinkedList { head: None }\n }\n\n fn insert(&mut self, value: T) {\n let new_node = Box::new(Node {\n value,\n next: None, // BUG: Should move self.head into next\n });\n self.head = Some(new_node);\n }\n\n fn len(&self) -> usize {\n let mut count = 0;\n let mut current = &self.head;\n while let Some(node) = current {\n count += 1;\n current = &node.next; // Correct, but insert is broken\n }\n count\n }\n}\n\nfn main() {\n println!(\"LinkedList test\");\n}",
123
  "tests": [
124
  {
125
  "name": "insert_single_element",
 
143
  "title": "Deadlock in Multi-threaded Worker System",
144
  "difficulty": "Hard",
145
  "description": "Fix a worker system using multiple threads to process jobs from a shared queue protected by synchronization primitives. Under certain workloads, threads block indefinitely due to: Improper lock acquisition order, Shared state handling issues, Missing signal/wake mechanisms. Requirements: Spawn N worker threads, Process jobs from shared queue without deadlock, Handle shutdown gracefully, No panics under load",
146
+ "header_section": "use std::sync::{Arc, Mutex, mpsc};\nuse std::thread;\n\nfn worker_system(num_workers: usize, jobs: Vec<i32>) -> Vec<i32> {\n let (tx, rx) = mpsc::channel();\n let rx = Arc::new(Mutex::new(rx));\n let results = Arc::new(Mutex::new(Vec::new()));\n \n let mut handles = vec![];\n \n for _ in 0..num_workers {\n let rx = Arc::clone(&rx);\n let results = Arc::clone(&results);\n \n let handle = thread::spawn(move || {\n loop {\n // BUG: Lock acquired but never released before trying to acquire results lock\n let receiver = rx.lock().unwrap();\n match receiver.try_recv() {\n Ok(job) => {\n let result = job * 2;\n // BUG: Tries to lock results while still holding rx lock - DEADLOCK\n results.lock().unwrap().push(result);\n }\n Err(_) => break,\n }\n }\n });\n handles.push(handle);\n }\n \n for job in jobs {\n let _ = tx.send(job); // Ignore send errors\n }\n drop(tx);\n \n for handle in handles {\n let _ = handle.join();\n }\n \n Arc::try_unwrap(results)\n .unwrap()\n .into_inner()\n .unwrap()\n}\n\nfn main() {\n println!(\"Worker system test\");\n}",
147
  "tests": [
148
  {
149
  "name": "single_worker_single_job",
 
167
  "title": "Async Function with Borrowing Conflicts",
168
  "difficulty": "Hard",
169
  "description": "Fix an asynchronous function that processes input data and performs non-blocking operations while returning references tied to the input. The implementation violates borrowing constraints in an async context, leading to: Compilation errors when using references across await points, Invalid reference usage. Requirements: Accept &str input, Perform async operation, Return derived reference, Must be sound and compile",
170
+ "header_section": "use std::pin::Pin;\nuse std::future::Future;\n\n// BUG: Cannot return reference that outlives await point\nasync fn process_async(input: &str) -> &str {\n // Simulating async work\n // tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;\n \n // BUG: input reference cannot be returned from async context like this\n input\n}\n\n// Better approach: return owned data or 'static reference\nfn process_sync(input: &str) -> String {\n input.to_uppercase()\n}\n\nfn main() {\n println!(\"Async test\");\n}",
171
  "tests": [
172
  {
173
  "name": "process_sync_basic",
 
191
  "title": "Unsafe FFI Integration Causing Crashes",
192
  "difficulty": "Hard",
193
  "description": "Fix Rust code that interfaces with an external C library using raw pointers. The implementation incorrectly handles: Pointer ownership, Memory allocation and deallocation, Undefined behavior risks. Requirements: Safely wrap C library calls, Properly manage memory (allocate/deallocate), No undefined behavior, Handle errors gracefully",
194
+ "header_section": "extern \"C\" {\n fn malloc(size: usize) -> *mut u8;\n fn free(ptr: *mut u8);\n}\n\nfn allocate_and_init(size: usize) -> Vec<u8> {\n unsafe {\n let ptr = malloc(size);\n // BUG: No null check - ptr could be null\n // BUG: Memory not initialized before use\n let slice = std::slice::from_raw_parts_mut(ptr, size);\n \n // Copy to vec and free\n let vec = slice.to_vec();\n free(ptr); // BUG: Freeing memory still referenced in vec\n vec\n }\n}\n\nfn main() {\n println!(\"FFI test\");\n}",
195
  "tests": [
196
  {
197
  "name": "allocate_small_buffer",
 
208
  "title": "Inefficient Data Processing Pipeline",
209
  "difficulty": "Hard",
210
  "description": "Fix a data pipeline that reads large datasets, applies transformations, and aggregates results. While functionally correct, the implementation has: Excessive memory allocations, Redundant iterations, Inefficient data copying. Requirements: Process data efficiently, Minimize allocations and copies, Use iterators when possible, Produce correct results with better performance",
211
+ "header_section": "fn process_data(numbers: Vec<i32>) -> i32 {\n // BUG: Multiple unnecessary allocations and iterations\n \n // First pass: filter evens (allocates new vector)\n let evens: Vec<i32> = numbers.iter()\n .filter(|n| n % 2 == 0)\n .copied()\n .collect();\n \n // Second pass: double values (allocates another vector)\n let doubled: Vec<i32> = evens.iter()\n .map(|n| n * 2)\n .collect();\n \n // Third pass: sum (unnecessary iteration)\n let sum: i32 = doubled.iter().sum();\n \n // Fourth pass: filter again (redundant)\n let final_sum: i32 = doubled.iter()\n .filter(|n| n % 4 == 0)\n .sum();\n \n final_sum\n}\n\nfn main() {\n println!(\"Efficiency test\");\n}",
212
  "tests": [
213
  {
214
  "name": "simple_pipeline",
 
232
  "title": "Reference-counted Cache with Memory Leak",
233
  "difficulty": "Hard+",
234
  "description": "Fix a caching system using reference-counted pointers to share data across components. The design creates cyclic references between cached objects, preventing memory from being released and causing memory usage to grow over time. Requirements: Implement caching without memory leaks, Break circular reference patterns, Use Rc/Arc correctly with Weak pointers when needed, Memory should be released when cache is cleared",
235
+ "header_section": "use std::rc::Rc;\nuse std::cell::RefCell;\n\n#[derive(Debug)]\nstruct CacheNode<T> {\n key: String,\n value: T,\n // BUG: This creates a cycle that prevents garbage collection\n related: RefCell<Option<Rc<CacheNode<T>>>>,\n}\n\n#[derive(Debug)]\nstruct Cache<T> {\n items: RefCell<Vec<Rc<CacheNode<T>>>>,\n}\n\nimpl<T: Clone> Cache<T> {\n fn new() -> Self {\n Cache {\n items: RefCell::new(Vec::new()),\n }\n }\n\n fn insert(&self, key: String, value: T) {\n let node = Rc::new(CacheNode {\n key,\n value,\n related: RefCell::new(None),\n });\n \n // BUG: Creating cyclic references\n if let Some(last) = self.items.borrow().last() {\n // Rc to Rc creates a cycle\n if let Ok(mut r) = last.related.try_borrow_mut() {\n *r = Some(Rc::clone(&node)); // Cycle here!\n }\n }\n \n self.items.borrow_mut().push(node);\n }\n}\n\nfn main() {\n println!(\"Cache test\");\n}",
236
  "tests": [
237
  {
238
  "name": "cache_insert_single",
server/app.py CHANGED
@@ -1,20 +1,19 @@
1
  """
2
- FastAPI application for the Rust Coder Environment.
3
 
4
- Endpoints:
5
- POST /reset β€” Start new episode (loads next problem)
6
- POST /step β€” Submit Rust code for evaluation
7
- GET /state β€” Get current episode state
8
- GET /schema β€” Action/observation JSON schemas
9
- WS /ws β€” WebSocket for persistent sessions
 
 
10
  """
11
 
12
  import os
13
  import logging
14
- import json
15
- import time
16
- import gradio as gr
17
- from openai import OpenAI
18
  from dotenv import load_dotenv
19
  from openenv.core.env_server.http_server import create_app
20
 
@@ -23,38 +22,13 @@ from server.rust_coder_environment import RustCoderEnvironment
23
 
24
  load_dotenv()
25
 
26
- # --- Logging (server/app.py) ---
27
  _LOG_LEVEL = (os.getenv("LOG_LEVEL") or "INFO").upper()
28
  logging.basicConfig(
29
  level=getattr(logging, _LOG_LEVEL, logging.INFO),
30
  format="%(asctime)s %(levelname)s %(name)s - %(message)s",
31
  )
32
- logger = logging.getLogger("rust_coder.server")
33
 
34
- # #region agent log
35
- _DEBUG_LOG_PATH = os.getenv("DEBUG_LOG_PATH") or "debug-55b5ef.log"
36
- _DEBUG_SESSION_ID = "55b5ef"
37
- def _dbg(hypothesis_id: str, location: str, message: str, data: dict, run_id: str = "pre-fix") -> None:
38
- try:
39
- payload = {
40
- "sessionId": _DEBUG_SESSION_ID,
41
- "runId": run_id,
42
- "hypothesisId": hypothesis_id,
43
- "location": location,
44
- "message": message,
45
- "data": data,
46
- "timestamp": int(time.time() * 1000),
47
- }
48
- with open(_DEBUG_LOG_PATH, "a", encoding="utf-8") as f:
49
- f.write(json.dumps(payload, ensure_ascii=False) + "\n")
50
- except Exception:
51
- # Never break app for debug logging
52
- pass
53
- # #endregion
54
-
55
- # --- Core OpenEnv Server Setup ---
56
- # Use a distinct name for the OpenEnv FastAPI instance
57
- openenv_app = create_app(
58
  RustCoderEnvironment,
59
  RustCoderAction,
60
  RustCoderObservation,
@@ -62,198 +36,15 @@ openenv_app = create_app(
62
  max_concurrent_envs=1,
63
  )
64
 
65
- # Add a health check endpoint for Docker directly to the base app
66
- @openenv_app.get("/health")
67
  async def health_check():
68
  return {"status": "healthy"}
69
 
70
- # --- Shared Logic ---
71
- API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
72
- MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
73
- HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
74
-
75
- def get_llm_solution(problem_desc: str):
76
- """Call LLM to get a Rust solution"""
77
- try:
78
- _dbg(
79
- "H2",
80
- "server/app.py:get_llm_solution:entry",
81
- "LLM call starting",
82
- {"model": MODEL_NAME, "base_url": API_BASE_URL, "prompt_chars": len(problem_desc or ""), "token_present": bool(HF_TOKEN)},
83
- )
84
- logger.info(
85
- "LLM call start model=%s base_url=%s prompt_chars=%d token_present=%s",
86
- MODEL_NAME,
87
- API_BASE_URL,
88
- len(problem_desc or ""),
89
- bool(HF_TOKEN),
90
- )
91
- client_llm = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
92
- completion = client_llm.chat.completions.create(
93
- model=MODEL_NAME,
94
- messages=[
95
- {"role": "system", "content": "You are an expert Rust developer. Respond ONLY with the code solution, no explanation."},
96
- {"role": "user", "content": f"Fix the following Rust problem:\n{problem_desc}"},
97
- ],
98
- temperature=0.2,
99
- )
100
- text = (completion.choices[0].message.content or "").strip()
101
- logger.debug("LLM raw response chars=%d", len(text))
102
- # Clean markdown code blocks
103
- if "```rust" in text:
104
- text = text.split("```rust")[1].split("```")[0]
105
- elif "```" in text:
106
- text = text.split("```")[1].split("```")[0]
107
- text = text.strip()
108
- if not text:
109
- _dbg("H2", "server/app.py:get_llm_solution:empty", "LLM returned empty after cleanup", {"raw_chars": len((completion.choices[0].message.content or ""))})
110
- logger.warning("LLM returned empty code after cleanup.")
111
- return "// LLM Error: empty response (no code returned)."
112
- _dbg("H2", "server/app.py:get_llm_solution:exit", "LLM call finished", {"returned_code_chars": len(text)})
113
- logger.info("LLM call end: returned_code_chars=%d", len(text))
114
- return text
115
- except Exception as e:
116
- _dbg("H2", "server/app.py:get_llm_solution:error", "LLM call exception", {"error": str(e)})
117
- logger.exception("LLM call failed.")
118
- return f"// LLM Error: {e}"
119
-
120
- def evaluate_single(problem_id, code=None):
121
- """Run evaluation for a specific problem. If code is None, it asks the LLM."""
122
- try:
123
- idx = int(problem_id.split(":")[0]) - 1
124
- problem = RustCoderEnvironment().problems[idx]
125
- _dbg(
126
- "H2",
127
- "server/app.py:evaluate_single:entry",
128
- "evaluate_single called",
129
- {"problem_id": str(problem_id), "idx": idx, "code_is_none": code is None, "code_chars": len(code or "")},
130
- )
131
- logger.info(
132
- "evaluate_single start problem_id=%s idx=%d code_provided=%s",
133
- problem_id,
134
- idx,
135
- code is not None,
136
- )
137
-
138
- # 1. Get code from LLM if not provided
139
- solution_code = code if code else get_llm_solution(problem["description"])
140
-
141
- # 2. Guard: If LLM failed, do not evaluate
142
- if not solution_code.strip() or solution_code.startswith("// LLM Error"):
143
- _dbg(
144
- "H2",
145
- "server/app.py:evaluate_single:abort",
146
- "evaluate_single abort due to empty/error code",
147
- {"starts_with_llm_error": solution_code.startswith("// LLM Error"), "solution_code_chars": len(solution_code or "")},
148
- )
149
- logger.warning(
150
- "evaluate_single abort: empty_or_error_code=%s chars=%d",
151
- solution_code.startswith("// LLM Error"),
152
- len(solution_code or ""),
153
- )
154
- return solution_code, {"error": "LLM failed to generate a solution. Check your HF_TOKEN."}
155
-
156
- # 3. Evaluate properly
157
- env = RustCoderEnvironment()
158
- # Reset to the specifically requested index
159
- state = env.reset(start_index=idx)
160
- logger.debug("evaluate_single step() submitting chars=%d", len(solution_code))
161
- state = env.step(RustCoderAction(code=solution_code))
162
- logger.info(
163
- "evaluate_single end reward=%.4f compilation_success=%s",
164
- float(state.reward or 0.0),
165
- bool(state.compilation_success),
166
- )
167
-
168
- metrics = {
169
- "Total Reward": f"{state.reward:.2f}",
170
- "Compilation": "Success" if state.compilation_success else "Failed",
171
- "Metrics": state.reward_breakdown
172
- }
173
- return solution_code, metrics
174
- except Exception as e:
175
- logger.exception("evaluate_single crashed.")
176
- return f"// Error: {e}", {"error": f"Evaluation system error: {e}"}
177
-
178
- def run_benchmark(progress=gr.Progress()):
179
- """Run all 10 problems through the LLM and show summary"""
180
- try:
181
- env = RustCoderEnvironment()
182
- rows = []
183
- total_score = 0.0
184
-
185
- # Check if token is actually present
186
- test_token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
187
- if not test_token:
188
- return "## Error: HF_TOKEN is not set. Add it to your HF Space secrets or local .env file.", []
189
-
190
- for i in range(len(env.problems)):
191
- progress(i/len(env.problems), desc=f"Benchmarking Task {i+1}...")
192
- problem = env.problems[i]
193
- code = get_llm_solution(problem["description"])
194
-
195
- reward = 0.0
196
- compiled = "Failed (LLM Error)"
197
-
198
- if not code.startswith("// LLM Error"):
199
- env.reset(start_index=i)
200
- state = env.step(RustCoderAction(code=code))
201
- reward = state.reward
202
- compiled = "Success" if state.compilation_success else "Failed"
203
-
204
- rows.append([problem["id"], problem["title"], problem.get("difficulty", "N/A"), f"{reward:.2f}", compiled])
205
- total_score += reward
206
-
207
- avg_score = total_score / len(env.problems)
208
- summary_md = f"## Benchmark Summary\n**Final Environment Score: {avg_score:.2f} / 1.0**"
209
- return summary_md, rows
210
- except Exception as e:
211
- return f"### Benchmark Error: {e}", []
212
-
213
- # --- Build the Gradio UI ---
214
- def create_dashboard():
215
- with gr.Blocks(title="Rust Coder Evaluation Dashboard") as demo:
216
- gr.Markdown("# πŸ¦€ Rust Coder: LLM Evaluation Dashboard")
217
-
218
- with gr.Tab("Individual Task Evaluation"):
219
- with gr.Row():
220
- with gr.Column(scale=1):
221
- p_env = RustCoderEnvironment()
222
- p_list = [f"{p['id']}: {p['title']} ({p.get('difficulty', 'N/A')})" for p in p_env.problems]
223
- dropdown = gr.Dropdown(choices=p_list, label="Select Question", value=p_list[0])
224
- desc = gr.Markdown(value=f"### Question [{p_env.problems[0].get('difficulty', 'N/A')}]\n{p_env.problems[0]['description']}")
225
-
226
- with gr.Column(scale=1):
227
- run_llm_btn = gr.Button("Generate Solution & Evaluate", variant="primary")
228
- code_display = gr.Code(label="AI Generated Solution", interactive=False)
229
- results_json = gr.JSON(label="Metric Breakdown")
230
-
231
- def update_desc(p_str):
232
- idx = int(p_str.split(":")[0]) - 1
233
- p = p_env.problems[idx]
234
- return f"### Question [{p.get('difficulty', 'N/A')}]\n{p['description']}", "" # Clear solution on change
235
-
236
- dropdown.change(update_desc, inputs=[dropdown], outputs=[desc, code_display])
237
- run_llm_btn.click(evaluate_single, inputs=[dropdown], outputs=[code_display, results_json])
238
-
239
- with gr.Tab("Full Environment Benchmark"):
240
- gr.Markdown("### Complete Environment Suite")
241
- gr.Markdown("Runs the LLM against all 10 tasks sequentially to determine the global OpenEnv score.")
242
-
243
- b_summarize = gr.Button("Run Performance Benchmark", variant="stop")
244
- b_sum = gr.Markdown()
245
- b_grid = gr.Dataframe(headers=["ID", "Title", "Difficulty", "Reward", "Compiled"], label="Task Results")
246
-
247
- b_summarize.click(run_benchmark, outputs=[b_sum, b_grid])
248
-
249
- return demo
250
-
251
- # Final consolidated Gradio App mounted on the FastAPI server
252
- app = gr.mount_gradio_app(openenv_app, create_dashboard(), path="/")
253
 
254
  def main(host: str = "0.0.0.0", port: int = 8000) -> None:
255
- """Entry point: uv run server or python -m server.app"""
256
  import uvicorn
 
257
  uvicorn.run(app, host=host, port=port)
258
 
259
 
 
1
  """
2
+ FastAPI application for the Rust Coder OpenEnv environment.
3
 
4
+ This module is the Hugging Face Space entrypoint (see `openenv.yaml` and Docker `CMD`).
5
+
6
+ Endpoints (provided by OpenEnv `create_app`):
7
+ - POST /reset
8
+ - POST /step
9
+ - GET /state
10
+ - GET /schema
11
+ - WS /ws
12
  """
13
 
14
  import os
15
  import logging
16
+
 
 
 
17
  from dotenv import load_dotenv
18
  from openenv.core.env_server.http_server import create_app
19
 
 
22
 
23
  load_dotenv()
24
 
 
25
  _LOG_LEVEL = (os.getenv("LOG_LEVEL") or "INFO").upper()
26
  logging.basicConfig(
27
  level=getattr(logging, _LOG_LEVEL, logging.INFO),
28
  format="%(asctime)s %(levelname)s %(name)s - %(message)s",
29
  )
 
30
 
31
+ app = create_app(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  RustCoderEnvironment,
33
  RustCoderAction,
34
  RustCoderObservation,
 
36
  max_concurrent_envs=1,
37
  )
38
 
39
+
40
+ @app.get("/health")
41
  async def health_check():
42
  return {"status": "healthy"}
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  def main(host: str = "0.0.0.0", port: int = 8000) -> None:
 
46
  import uvicorn
47
+
48
  uvicorn.run(app, host=host, port=port)
49
 
50
 
server/rust_coder_environment.py CHANGED
@@ -129,7 +129,7 @@ class RustCoderEnvironment(Environment):
129
 
130
  return RustCoderObservation(
131
  problem_description=problem["description"],
132
- starter_code=problem.get("starter_code", ""),
133
  compilation_success=False,
134
  compilation_output="",
135
  test_results=[],
@@ -143,6 +143,7 @@ class RustCoderEnvironment(Environment):
143
  self.step_count += 1
144
  problem = self.problems[self.current_problem_idx]
145
  code = action.code
 
146
 
147
  self._dbg(
148
  "H1",
@@ -169,9 +170,9 @@ class RustCoderEnvironment(Environment):
169
  base_url = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
170
  token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
171
  prompt = problem.get("description", "")
172
- starter = problem.get("starter_code", "")
173
- if starter:
174
- prompt += f"\n\nStarter Code:\n```rust\n{starter}\n```"
175
 
176
  self._dbg(
177
  "H5",
@@ -191,7 +192,7 @@ class RustCoderEnvironment(Environment):
191
  self._logger.error("AUTO_LLM_ON_EMPTY_STEP enabled but HF_TOKEN/API_KEY missing.")
192
  return RustCoderObservation(
193
  problem_description=problem.get("description", ""),
194
- starter_code=problem.get("starter_code", ""),
195
  compilation_success=False,
196
  compilation_output="Error: AUTO_LLM_ON_EMPTY_STEP enabled but HF_TOKEN/API_KEY is missing.",
197
  test_results=[],
@@ -210,8 +211,8 @@ class RustCoderEnvironment(Environment):
210
  completion = client_llm.chat.completions.create(
211
  model=model,
212
  messages=[
213
- {"role": "system", "content": "You are a senior Rust engineer. Return ONLY the complete fixed Rust code. No explanation."},
214
- {"role": "user", "content": prompt},
215
  ],
216
  temperature=0.1,
217
  )
@@ -264,7 +265,7 @@ class RustCoderEnvironment(Environment):
264
  done = False
265
  return RustCoderObservation(
266
  problem_description=problem["description"],
267
- starter_code=problem.get("starter_code", ""),
268
  compilation_success=False,
269
  compilation_output="Error: no code submitted.",
270
  test_results=[],
@@ -279,6 +280,28 @@ class RustCoderEnvironment(Environment):
279
  reward=0.0,
280
  )
281
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  # ── 1. Compilation (40%) ──────────────────────────────────────
283
  compilation_success, compilation_output = self._compile_check(code)
284
  r_compilation = 1.0 if compilation_success else 0.0
@@ -302,7 +325,9 @@ class RustCoderEnvironment(Environment):
302
  r_coverage = 1.0
303
 
304
  # ── 3. Elegance (10%) ─────────────────────────────────────────
305
- r_elegance = self._score_elegance(code)
 
 
306
 
307
  # ── 4. Efficiency (10%) ───────────────────────────────────────
308
  baseline_ms: float = problem.get("performance_baseline_ms", 100.0)
@@ -318,30 +343,43 @@ class RustCoderEnvironment(Environment):
318
  "elegance": round(r_elegance, 4),
319
  "efficiency": round(r_efficiency, 4),
320
  }
321
- # Calculate weighted total reward
322
- total_reward = round(
323
- r_compilation * 0.40
324
- + r_correctness * 0.20
325
- + r_coverage * 0.20
326
- + r_elegance * 0.10
327
- + r_efficiency * 0.10,
328
- 4,
329
- )
 
 
 
 
330
 
331
  # ── Advance Logic ─────────────────────────────────────────────
332
  self.current_problem_idx += 1
333
  done = self.current_problem_idx >= len(self.problems)
334
 
335
  next_prob_desc = "--- ALL TASKS COMPLETED in this episode ---"
336
- next_starter = ""
337
  if not done:
338
  next_prob = self.problems[self.current_problem_idx]
339
  next_prob_desc = f"--- NEXT TASK: {next_prob['title']} ---\n\n{next_prob['description']}"
340
- next_starter = next_prob.get("starter_code", "")
 
 
 
 
 
 
 
 
 
341
 
342
  return RustCoderObservation(
343
- problem_description=next_prob_desc,
344
- starter_code=next_starter,
345
  compilation_success=compilation_success,
346
  compilation_output=compilation_output[:2000], # cap length
347
  test_results=test_results,
 
129
 
130
  return RustCoderObservation(
131
  problem_description=problem["description"],
132
+ header_section=problem.get("header_section", ""),
133
  compilation_success=False,
134
  compilation_output="",
135
  test_results=[],
 
143
  self.step_count += 1
144
  problem = self.problems[self.current_problem_idx]
145
  code = action.code
146
+ header = problem.get("header_section", "")
147
 
148
  self._dbg(
149
  "H1",
 
170
  base_url = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
171
  token = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
172
  prompt = problem.get("description", "")
173
+ header = problem.get("header_section", "")
174
+ if header:
175
+ prompt += f"\n\nHeader Section (must be included verbatim in your final code):\n```rust\n{header}\n```"
176
 
177
  self._dbg(
178
  "H5",
 
192
  self._logger.error("AUTO_LLM_ON_EMPTY_STEP enabled but HF_TOKEN/API_KEY missing.")
193
  return RustCoderObservation(
194
  problem_description=problem.get("description", ""),
195
+ header_section=problem.get("header_section", ""),
196
  compilation_success=False,
197
  compilation_output="Error: AUTO_LLM_ON_EMPTY_STEP enabled but HF_TOKEN/API_KEY is missing.",
198
  test_results=[],
 
211
  completion = client_llm.chat.completions.create(
212
  model=model,
213
  messages=[
214
+ {"role": "system", "content": "You are a senior Rust engineer. Return ONLY the complete Rust code file. No explanation."},
215
+ {"role": "user", "content": prompt + "\n\nReturn the COMPLETE Rust code, including the header section above."},
216
  ],
217
  temperature=0.1,
218
  )
 
265
  done = False
266
  return RustCoderObservation(
267
  problem_description=problem["description"],
268
+ header_section=problem.get("header_section", ""),
269
  compilation_success=False,
270
  compilation_output="Error: no code submitted.",
271
  test_results=[],
 
280
  reward=0.0,
281
  )
282
 
283
+ # Do NOT mutate submissions by injecting header_section.
284
+ # LeetCode-style behavior: the agent/LLM must return a complete Rust file
285
+ # that already includes the required header_section.
286
+ if header and header.strip() and header.strip() not in (code or ""):
287
+ done = False
288
+ return RustCoderObservation(
289
+ problem_description=problem.get("description", ""),
290
+ header_section=header,
291
+ compilation_success=False,
292
+ compilation_output="Error: submission is missing the required header_section. Return the complete Rust code including the header_section.",
293
+ test_results=[],
294
+ reward_breakdown={
295
+ "compilation": 0.0,
296
+ "correctness": 0.0,
297
+ "coverage": 0.0,
298
+ "elegance": 0.0,
299
+ "efficiency": 0.0,
300
+ },
301
+ done=done,
302
+ reward=0.0,
303
+ )
304
+
305
  # ── 1. Compilation (40%) ──────────────────────────────────────
306
  compilation_success, compilation_output = self._compile_check(code)
307
  r_compilation = 1.0 if compilation_success else 0.0
 
325
  r_coverage = 1.0
326
 
327
  # ── 3. Elegance (10%) ─────────────────────────────────────────
328
+ # Only score elegance for code that compiles; otherwise it can
329
+ # incorrectly award points for non-compiling submissions.
330
+ r_elegance = self._score_elegance(code) if compilation_success else 0.0
331
 
332
  # ── 4. Efficiency (10%) ───────────────────────────────────────
333
  baseline_ms: float = problem.get("performance_baseline_ms", 100.0)
 
343
  "elegance": round(r_elegance, 4),
344
  "efficiency": round(r_efficiency, 4),
345
  }
346
+ # Calculate weighted total reward.
347
+ # Hard rule: if it doesn't compile, total reward must be 0.0.
348
+ if not compilation_success:
349
+ total_reward = 0.0
350
+ else:
351
+ total_reward = round(
352
+ r_compilation * 0.40
353
+ + r_correctness * 0.20
354
+ + r_coverage * 0.20
355
+ + r_elegance * 0.10
356
+ + r_efficiency * 0.10,
357
+ 4,
358
+ )
359
 
360
  # ── Advance Logic ─────────────────────────────────────────────
361
  self.current_problem_idx += 1
362
  done = self.current_problem_idx >= len(self.problems)
363
 
364
  next_prob_desc = "--- ALL TASKS COMPLETED in this episode ---"
365
+ next_header = ""
366
  if not done:
367
  next_prob = self.problems[self.current_problem_idx]
368
  next_prob_desc = f"--- NEXT TASK: {next_prob['title']} ---\n\n{next_prob['description']}"
369
+ next_header = next_prob.get("header_section", "")
370
+
371
+ # IMPORTANT: The compilation/test results correspond to the code evaluated
372
+ # on `problem` (the current task), while the UI should also know what's next.
373
+ # To avoid confusion, include both "evaluated" and "next" in the description.
374
+ response_problem_desc = (
375
+ f"--- EVALUATED TASK: {problem.get('title', '')} ---\n\n"
376
+ f"{problem.get('description', '')}\n\n"
377
+ f"{next_prob_desc}"
378
+ )
379
 
380
  return RustCoderObservation(
381
+ problem_description=response_problem_desc,
382
+ header_section=next_header,
383
  compilation_success=compilation_success,
384
  compilation_output=compilation_output[:2000], # cap length
385
  test_results=test_results,