---
# Manifest for the code-review agent environment: package metadata, the
# environment class to load, the task catalogue, and episode limits.

name: code-review-agent-env
# Quoted so the version is always read as a string, never as a number.
version: "1.0.0"
description: |
  A realistic code review environment where AI agents review pull requests,
  identify issues, suggest improvements, and make approval decisions.
  Models real-world software development workflows.

authors:
  - Ashish <ashishkbaberwal@gmail.com>
  - Shardul <shardulmd@gmail.com>
  - Harshit <shakyanitin807@gmail.com>
tags:
  - code-review
  - software-engineering
  - agent-evaluation
  - real-world-task

license: MIT

# The same class is referenced in two notations: a dotted path (`class`) and
# a `module:attribute` path (`entry_point`). Keep both in sync when renaming.
environment:
  class: environment.env.CodeReviewEnv
  entry_point: environment.env:CodeReviewEnv

# Task catalogue: 3 easy, 5 medium and 6 hard tasks. Every task uses the same
# deterministic grader served at /score. Task names contain ": " and therefore
# must stay quoted.
tasks:
  # --- easy ---
  - id: bug_detection_easy_1
    name: "Easy: Detect Division by Zero"
    difficulty: easy
    grader:
      type: deterministic
      endpoint: /score

  - id: bug_detection_easy_2
    name: "Easy: Off-by-One Error"
    difficulty: easy
    grader:
      type: deterministic
      endpoint: /score

  - id: approve_easy_3
    name: "Easy: Approve Safe Refactor"
    difficulty: easy
    grader:
      type: deterministic
      endpoint: /score

  # --- medium ---
  - id: memory_leak_medium_1
    name: "Medium: Memory Leak Detection"
    difficulty: medium
    grader:
      type: deterministic
      endpoint: /score

  - id: performance_medium_2
    name: "Medium: String Concatenation Performance"
    difficulty: medium
    grader:
      type: deterministic
      endpoint: /score

  - id: approve_medium_3
    name: "Medium: Approve Safe Query Helper"
    difficulty: medium
    grader:
      type: deterministic
      endpoint: /score

  - id: type_safety_medium_4
    name: "Medium: Type Safety Optional Arithmetic"
    difficulty: medium
    grader:
      type: deterministic
      endpoint: /score

  - id: javascript_medium_5
    name: "Medium: JavaScript Undefined Access"
    difficulty: medium
    grader:
      type: deterministic
      endpoint: /score

  # --- hard ---
  - id: security_hard_1
    name: "Hard: SQL Injection Vulnerability"
    difficulty: hard
    grader:
      type: deterministic
      endpoint: /score

  - id: race_condition_hard_2
    name: "Hard: Race Condition"
    difficulty: hard
    grader:
      type: deterministic
      endpoint: /score

  - id: approve_hard_3
    name: "Hard: Approve Thread-Safe Counter"
    difficulty: hard
    grader:
      type: deterministic
      endpoint: /score

  - id: adversarial_hard_4
    name: "Hard: Adversarial Safe SQL Builder"
    difficulty: hard
    grader:
      type: deterministic
      endpoint: /score

  - id: concurrency_hard_5
    name: "Hard: Async Await Misuse"
    difficulty: hard
    grader:
      type: deterministic
      endpoint: /score

  - id: dependency_injection_hard_6
    name: "Hard: Tight Coupling in Service"
    difficulty: hard
    grader:
      type: deterministic
      endpoint: /score

observation_space:
  type: dict
  description: |
    Contains code diff, file context, and review status

action_space:
  type: dict
  description: |
    Actions include adding comments, suggesting fixes, approving, or requesting changes

# Declared reward bounds; the minimum is negative.
reward_range:
  min: -0.5
  max: 1.0

max_episode_steps: 50

# NOTE(review): presumably the names of environment variables that must be set
# before running; API_BASE_URL and MODEL_NAME do not look like secrets, so
# confirm how the consumer interprets this list.
requires_api_keys:
  - API_KEY
  - API_BASE_URL
  - MODEL_NAME