Spaces:

100XZX001
/

code-review-openenv

Sleeping

App Files Files Community

100XZX001 committed 26 days ago

Commit

8846f87

verified ·

1 Parent(s): 04362f9

Update environment.py

Browse files

Files changed (1) hide show

environment.py +128 -22

environment.py CHANGED Viewed

@@ -1,16 +1,127 @@
-from typing import Tuple, Dict, Any
 from models import Observation, Action, Reward, State
 from grader import grade_comment, grade_question, grade_fix
 class CodeReviewEnv:
     def __init__(self, task: str = "easy"):
         self.task = task
         self.reset()
     def set_task(self, task: str):
         if task not in ["easy", "medium", "hard", "harder", "hardest"]:
             raise ValueError(f"Unknown task: {task}")
         self.task = task
     def reset(self) -> Observation:
         if self.task is None:
@@ -76,6 +187,10 @@ class CodeReviewEnv:
             reward = 0.2  # dense bonus for writing
             quality_score = grade_comment(self.agent_comment, self.expected_keywords, self.expert_comment)
             reward += quality_score
             self.done = True
         elif action.action_type == "ask_question":
@@ -83,11 +198,11 @@ class CodeReviewEnv:
                 reward = -0.1
             else:
                 q_score = grade_question(action.question)
-                reward = 0.1 + q_score   # small bonus + quality
-                # Simulate a helpful answer
-                answer = self._answer_question(action.question)
                 self.comments.append(f"Agent: {action.question}")
-                self.comments.append(f"Env: {answer}")
                 self.step_count += 1
                 # Episode continues, not done
@@ -95,11 +210,14 @@ class CodeReviewEnv:
             if not action.fix_code:
                 reward = -0.2
             else:
-                # We'll use a simple keyword check for demonstration
-                # In a full version, you'd run unit tests
-                fix_score = grade_fix(action.fix_code, self.expected_fix_keywords, None)
-                reward = 0.3 + fix_score
-                self.test_results = f"Fix evaluated with score {fix_score:.2f}"
             self.done = True
         elif action.action_type == "skip":
@@ -116,18 +234,6 @@ class CodeReviewEnv:
         obs = self._get_observation()
         return obs, Reward(value=reward), self.done, info
-    def _answer_question(self, question: str) -> str:
-        # Simple rule‑based answers – you can expand
-        q = question.lower()
-        if "what" in q and "purpose" in q:
-            return "The purpose of this function is to retrieve a user by ID from a dictionary."
-        elif "expected" in q:
-            return "The function should return the user object if the ID exists, otherwise raise a KeyError."
-        elif "how" in q and "fix" in q:
-            return "You might consider adding a check for missing keys or using a safer dictionary method like `get`."
-        else:
-            return "I'm not sure. Could you be more specific?"
     def _get_observation(self) -> Observation:
         return Observation(
             pr_title=self.pr_title,

+from typing import Tuple, Dict, Any, List, Optional
 from models import Observation, Action, Reward, State
 from grader import grade_comment, grade_question, grade_fix
+import sys
+import io
+import contextlib
+# ------------------------- Simulated CI / Unit tests -------------------------
def _run_task_check(fix_code, task, namespace):
    """Return a truthy value when ``namespace`` passes the check for ``task``.

    ``namespace`` is the globals dict the submitted fix was executed in, so
    any function the fix defines is looked up there. Exceptions that are not
    part of a task's expected behaviour propagate to the caller.
    """
    if task == "easy":
        # Fetch the function outside the try block so a missing definition
        # is not mistaken for the expected KeyError.
        get_user = namespace.get("get_user")
        if not callable(get_user):
            return False
        try:
            get_user("bob")
        except KeyError:
            return True  # expected: missing key raises KeyError
        except Exception:
            return False
        return False  # a silent return for a missing key is a failure
    if task == "medium":
        # Nothing further to verify: the fix already compiled and ran.
        return True
    if task == "hard":
        # An empty list must yield 0 rather than dividing by zero; a missing
        # function or any raised error propagates and scores 0.
        return namespace["calculate_average"]([]) == 0
    lowered = fix_code.lower()
    if task == "harder":
        return "lock" in lowered
    # "hardest" (and any other task name): look for a lock-ordering mention.
    return "same order" in lowered or "lock order" in lowered


def run_unit_tests(fix_code: str, task: str) -> float:
    """Run the simulated CI check for ``task`` against a submitted fix.

    The fix is executed once in a fresh namespace, then a task-specific
    check runs against that same namespace:

    * ``easy``    - ``get_user("bob")`` must raise ``KeyError``.
    * ``medium``  - the fix must compile and execute without error.
    * ``hard``    - ``calculate_average([])`` must equal ``0``.
    * ``harder``  - the fix text must mention ``lock``.
    * otherwise   - the fix text must mention ``same order`` or ``lock order``.

    Args:
        fix_code: Python source submitted by the agent.
        task: Task name selecting which check to run.

    Returns:
        ``1.0`` when the check passes, ``0.0`` when it fails or when the fix
        raises while being executed or checked.

    NOTE(security): ``exec`` runs the submission in-process with full
    interpreter privileges. This is NOT a sandbox; untrusted submissions
    should be run in an isolated subprocess or container instead.
    """
    captured = io.StringIO()
    # Pre-seed the "easy" fixture so a fix that reads a module-level ``users``
    # mapping can find it; a fix that defines its own ``users`` overrides it.
    namespace = {"users": {"alice": "Alice"}} if task == "easy" else {}
    try:
        # A single globals dict (no separate locals) lets functions defined
        # by the fix resolve each other and the fixture at call time.
        with contextlib.redirect_stdout(captured), contextlib.redirect_stderr(captured):
            exec(fix_code, namespace)
            passed = _run_task_check(fix_code, task, namespace)
    except (Exception, SystemExit):
        # A crashing fix, or one that calls exit(), scores zero instead of
        # taking the environment down with it.
        return 0.0
    return 1.0 if passed else 0.0
+# ------------------------- Simulated PR Author -------------------------
class SimulatedAuthor:
    """Rule-based stand-in for the PR author.

    Replies to the agent's review comments and questions with canned,
    keyword-triggered responses.
    """

    def __init__(self, task: str):
        # Stored for reference; the canned replies do not depend on it.
        self.task = task

    def respond(self, agent_comment: Optional[str] = "", agent_question: Optional[str] = None) -> str:
        """Return the author's reply to a question or a comment.

        A non-empty ``agent_question`` takes precedence over
        ``agent_comment``. ``agent_comment`` is optional so the method can
        be called with only ``agent_question=...``.

        Args:
            agent_comment: Review comment text; ``None`` is treated as empty.
            agent_question: Question text, if the agent asked one.

        Returns:
            The canned reply string.
        """
        if agent_question:
            question = agent_question.lower()
            if "what" in question and "purpose" in question:
                return "The purpose is to retrieve a user safely."
            if "expected" in question:
                return "It should return the user or raise KeyError."
            return "Could you be more specific?"
        # Generic response to a comment; tolerate a missing comment.
        if "good" in (agent_comment or "").lower():
            return "Thanks for the feedback!"
        return "I'll consider your suggestion."
+# ------------------------- Main Environment -------------------------
 class CodeReviewEnv:
     def __init__(self, task: str = "easy"):
         self.task = task
+        self.author = None
         self.reset()
     def set_task(self, task: str):
         if task not in ["easy", "medium", "hard", "harder", "hardest"]:
             raise ValueError(f"Unknown task: {task}")
         self.task = task
+        self.author = SimulatedAuthor(task)
     def reset(self) -> Observation:
         if self.task is None:
             reward = 0.2  # dense bonus for writing
             quality_score = grade_comment(self.agent_comment, self.expected_keywords, self.expert_comment)
             reward += quality_score
+            # Simulate author response
+            author_response = self.author.respond(self.agent_comment)
+            self.comments.append(f"Agent: {self.agent_comment}")
+            self.comments.append(f"Author: {author_response}")
             self.done = True
         elif action.action_type == "ask_question":
                 reward = -0.1
             else:
                 q_score = grade_question(action.question)
+                reward = 0.1 + q_score
+                # Get answer from simulated author
+                answer = self.author.respond(agent_question=action.question)
                 self.comments.append(f"Agent: {action.question}")
+                self.comments.append(f"Author: {answer}")
                 self.step_count += 1
                 # Episode continues, not done
             if not action.fix_code:
                 reward = -0.2
             else:
+                # Run CI tests
+                test_score = run_unit_tests(action.fix_code, self.task)
+                # Also keyword match for partial credit
+                kw_score = grade_fix(action.fix_code, self.expected_fix_keywords, None)
+                # Combined score: 70% tests, 30% keywords
+                combined_score = 0.7 * test_score + 0.3 * kw_score
+                reward = 0.3 + combined_score
+                self.test_results = f"CI tests passed: {test_score:.0%}, Keywords: {kw_score:.0%}"
             self.done = True
         elif action.action_type == "skip":
         obs = self._get_observation()
         return obs, Reward(value=reward), self.done, info
     def _get_observation(self) -> Observation:
         return Observation(
             pr_title=self.pr_title,