aws_rl_env / tests /test_task_grader.py
Sizzing's picture
Upload folder using huggingface_hub
0f8f2c1 verified
"""Unit tests for TaskGrader — tests all grading strategies and reward shaping.

These tests mock AwsBackend/ResourceVerifier, so they run without MiniStack.

Run locally:
    uv run pytest tests/test_task_grader.py -v

Run inside a container:
    docker exec <container> python -m pytest env/tests/test_task_grader.py -v
"""
from unittest.mock import MagicMock, patch
import pytest
from models import (
SuccessCriteria,
Task,
TaskID,
TaskDifficulty,
ResourceExistsCheck,
StepCriteria,
StateCheck,
)
from server.services.task_grader import TaskGrader
from server.services.episode_tracker import EpisodeTracker, StepRecord
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def mock_backend() -> MagicMock:
    """Provide a fresh MagicMock that stands in for the AWS backend."""
    backend = MagicMock()
    return backend
@pytest.fixture
def grader(mock_backend: MagicMock) -> TaskGrader:
    """Provide a TaskGrader constructed around the mocked backend."""
    task_grader = TaskGrader(mock_backend)
    return task_grader
@pytest.fixture
def tracker() -> EpisodeTracker:
    """Provide a new, empty EpisodeTracker for each test."""
    episode_tracker = EpisodeTracker()
    return episode_tracker
def _step(command: str, success: bool = True) -> StepRecord:
    """Build a standalone StepRecord for *command*.

    The record has empty stdout/stderr and a step number of 0; it is not
    registered with any tracker.
    """
    record = StepRecord(
        step_number=0,
        command=command,
        success=success,
        stdout="",
        stderr="",
    )
    return record
def _task(
    criteria: SuccessCriteria, difficulty: TaskDifficulty = TaskDifficulty.WARMUP
) -> Task:
    """Wrap *criteria* in a throwaway Task (id 999) at the given difficulty."""
    task_fields = {
        "task_id": TaskID(999),
        "difficulty": difficulty,
        "description": "test task",
        "success_criteria": criteria,
    }
    return Task(**task_fields)
# ===================================================================
# _grade_command_match (warmup tier)
# ===================================================================
class TestGradeCommandMatch:
    """Command-match grading, exercised with default (WARMUP) tasks."""

    @staticmethod
    def _record_and_grade(
        grader: TaskGrader,
        tracker: EpisodeTracker,
        criteria: SuccessCriteria,
        command: str,
        success: bool = True,
    ):
        """Record one command on the tracker and grade it against *criteria*.

        Returns whatever ``grader.grade`` returns.
        """
        latest = _step(command, success=success)
        tracker.record_step(latest.command, latest.success, "", "")
        return grader.grade(_task(criteria), tracker, latest)

    def test_correct_command_achieves(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """Matching service and operation on a successful command is achieved."""
        outcome = self._record_and_grade(
            grader,
            tracker,
            SuccessCriteria(command_contains="s3", operation="ls"),
            "aws s3 ls",
        )

        assert outcome.task_achieved
        assert outcome.reward == 1.0

    def test_wrong_service_fails(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """A command for a different service does not achieve the task."""
        outcome = self._record_and_grade(
            grader,
            tracker,
            SuccessCriteria(command_contains="s3", operation="ls"),
            "aws ec2 describe-instances",
        )

        assert not outcome.task_achieved

    def test_wrong_operation_fails(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """The right service with a different operation does not achieve the task."""
        outcome = self._record_and_grade(
            grader,
            tracker,
            SuccessCriteria(command_contains="s3", operation="ls"),
            "aws s3 mb s3://bucket",
        )

        assert not outcome.task_achieved

    def test_failed_command_not_achieved(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """A matching command that did not succeed is not counted as achieved."""
        outcome = self._record_and_grade(
            grader,
            tracker,
            SuccessCriteria(command_contains="s3", operation="ls"),
            "aws s3 ls",
            success=False,
        )

        assert not outcome.task_achieved

    def test_case_insensitive(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """Upper-case criteria still match a lower-case command."""
        outcome = self._record_and_grade(
            grader,
            tracker,
            SuccessCriteria(command_contains="S3", operation="LS"),
            "aws s3 ls",
        )

        assert outcome.task_achieved
# ===================================================================
# _grade_resource_creation (beginner tier)
# ===================================================================
class TestGradeResourceCreation:
    """Resource-creation grading, exercised with BEGINNER tasks."""

    @staticmethod
    def _grade_create_bucket(
        grader: TaskGrader,
        tracker: EpisodeTracker,
        command: str,
        resource_present: bool,
    ):
        """Grade *command* against a "create my-bucket" task.

        The verifier's ``resource_exists`` is patched to return
        *resource_present* for the duration of the grading call. Returns
        whatever ``grader.grade`` returns.
        """
        spec = SuccessCriteria(
            command_contains="s3api",
            operation="create-bucket",
            resource_exists=ResourceExistsCheck(service="s3", name="my-bucket"),
        )
        latest = _step(command)
        tracker.record_step(latest.command, latest.success, "", "")
        with patch.object(
            grader._verifier, "resource_exists", return_value=resource_present
        ):
            return grader.grade(
                _task(spec, TaskDifficulty.BEGINNER), tracker, latest
            )

    def test_resource_exists_achieves(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """Correct command plus an existing resource yields full credit."""
        outcome = self._grade_create_bucket(
            grader,
            tracker,
            "aws s3api create-bucket --bucket my-bucket",
            resource_present=True,
        )

        assert outcome.task_achieved
        assert outcome.reward == 1.0
        assert outcome.partial_progress == 1.0

    def test_resource_missing_but_cmd_ok_gives_partial(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """Correct command without the resource yields half progress."""
        outcome = self._grade_create_bucket(
            grader,
            tracker,
            "aws s3api create-bucket --bucket my-bucket",
            resource_present=False,
        )

        assert not outcome.task_achieved
        assert outcome.partial_progress == 0.5

    def test_wrong_command_and_no_resource_gives_zero(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """An unrelated command with no resource yields zero progress."""
        outcome = self._grade_create_bucket(
            grader,
            tracker,
            "aws sts get-caller-identity",
            resource_present=False,
        )

        assert not outcome.task_achieved
        assert outcome.partial_progress == 0.0
# ===================================================================
# _grade_multi_step (intermediate/advanced tier)
# ===================================================================
class TestGradeMultiStep:
    """Multi-step grading, exercised with INTERMEDIATE and ADVANCED tasks."""

    def test_all_steps_completed_achieves(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """Running both required operations achieves the task with reward 1.0."""
        required = [
            StepCriteria(operation=op, resource="data")
            for op in ("create-bucket", "put-object")
        ]
        spec = SuccessCriteria(steps=required)
        commands = (
            "aws s3api create-bucket --bucket data",
            "aws s3api put-object --bucket data --key f",
        )
        for cmd in commands:
            latest = tracker.record_step(cmd, True, "", "")

        task = _task(spec, TaskDifficulty.INTERMEDIATE)
        outcome = grader.grade(task, tracker, latest)

        assert outcome.task_achieved
        assert outcome.reward == 1.0

    def test_partial_steps_gives_progress(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """Completing one of two steps yields 0.5 progress without achievement."""
        required = [
            StepCriteria(operation=op, resource="data")
            for op in ("create-bucket", "put-object")
        ]
        spec = SuccessCriteria(steps=required)
        latest = tracker.record_step(
            "aws s3api create-bucket --bucket data", True, "", ""
        )

        task = _task(spec, TaskDifficulty.INTERMEDIATE)
        outcome = grader.grade(task, tracker, latest)

        assert not outcome.task_achieved
        assert outcome.partial_progress == 0.5

    def test_ordered_stops_at_first_missing(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """Skipping the middle step means only the first step counts."""
        required = [
            StepCriteria(operation=op, resource="orders")
            for op in ("create-table", "put-item", "query")
        ]
        spec = SuccessCriteria(steps=required)

        # Run steps 1 and 3 only; step 2 (put-item) is deliberately skipped.
        performed = (
            "aws dynamodb create-table --table-name orders",
            "aws dynamodb query --table-name orders",
        )
        for cmd in performed:
            latest = tracker.record_step(cmd, True, "", "")

        task = _task(spec, TaskDifficulty.INTERMEDIATE)
        outcome = grader.grade(task, tracker, latest)

        assert not outcome.task_achieved
        # Ordering is enforced, so progress stops at the missing step: 1 of 3.
        assert outcome.partial_progress == pytest.approx(1 / 3)

    def test_services_required_must_be_met(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """All steps done and every listed service used achieves the task."""
        spec = SuccessCriteria(
            services=["iam", "lambda"],
            steps=[
                StepCriteria(operation="create-role"),
                StepCriteria(operation="create-function", resource="my-fn"),
            ],
        )
        tracker.record_step("aws iam create-role --role-name r", True, "", "")
        latest = tracker.record_step(
            "aws lambda create-function --function-name my-fn", True, "", ""
        )

        task = _task(spec, TaskDifficulty.ADVANCED)
        outcome = grader.grade(task, tracker, latest)

        assert outcome.task_achieved

    def test_missing_service_prevents_achievement(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """All steps done but one listed service unused is not achieved."""
        spec = SuccessCriteria(
            services=["iam", "lambda", "sqs"],
            steps=[
                StepCriteria(operation="create-role"),
                StepCriteria(operation="create-function", resource="my-fn"),
            ],
        )
        tracker.record_step("aws iam create-role --role-name r", True, "", "")
        latest = tracker.record_step(
            "aws lambda create-function --function-name my-fn", True, "", ""
        )

        task = _task(spec, TaskDifficulty.ADVANCED)
        outcome = grader.grade(task, tracker, latest)

        # No sqs command was ever recorded.
        assert not outcome.task_achieved

    def test_empty_steps_not_achieved(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """An empty step list never counts as an achieved task."""
        spec = SuccessCriteria(steps=[])
        latest = _step("aws s3 ls")
        tracker.record_step(latest.command, latest.success, "", "")

        task = _task(spec, TaskDifficulty.INTERMEDIATE)
        outcome = grader.grade(task, tracker, latest)

        assert not outcome.task_achieved

    def test_failed_command_not_counted(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """A failed command does not complete the step it matches."""
        spec = SuccessCriteria(
            steps=[StepCriteria(operation="create-bucket", resource="data")]
        )
        latest = tracker.record_step(
            "aws s3api create-bucket --bucket data", False, "", "error"
        )

        task = _task(spec, TaskDifficulty.INTERMEDIATE)
        outcome = grader.grade(task, tracker, latest)

        assert not outcome.task_achieved
# ===================================================================
# _grade_state_checks (expert tier)
# ===================================================================
class TestGradeStateChecks:
    """State-check grading, exercised with EXPERT tasks."""

    def test_all_checks_pass_achieves(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """A passing state check with the required service used is achieved."""
        versioning_check = StateCheck(
            command="aws s3api get-bucket-versioning --bucket b",
            output_contains="Enabled",
        )
        spec = SuccessCriteria(services=["s3"], state_checks=[versioning_check])
        latest = tracker.record_step(
            "aws s3api put-bucket-versioning --bucket b", True, "", ""
        )

        verifier = grader._verifier
        with patch.object(verifier, "check_state", return_value=True):
            outcome = grader.grade(
                _task(spec, TaskDifficulty.EXPERT), tracker, latest
            )

        assert outcome.task_achieved

    def test_failing_check_prevents_achievement(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """One failing check blocks achievement but leaves non-zero progress."""
        checks = [
            StateCheck(command="cmd1", output_contains="x"),
            StateCheck(command="cmd2", output_contains="y"),
        ]
        spec = SuccessCriteria(services=["s3"], state_checks=checks)
        latest = tracker.record_step("aws s3 ls", True, "", "")

        verifier = grader._verifier
        # First check passes, second fails.
        with patch.object(verifier, "check_state", side_effect=[True, False]):
            outcome = grader.grade(
                _task(spec, TaskDifficulty.EXPERT), tracker, latest
            )

        assert not outcome.task_achieved
        # Partial credit for 1 of 2 checks.
        assert outcome.partial_progress > 0

    def test_services_required_for_state_checks(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """Passing checks are not enough when a listed service was never used."""
        spec = SuccessCriteria(
            services=["s3", "dynamodb"],
            state_checks=[StateCheck(command="cmd1", output_contains="ok")],
        )
        # Only an s3 command is recorded; dynamodb is never touched.
        latest = tracker.record_step("aws s3 ls", True, "", "")

        verifier = grader._verifier
        with patch.object(verifier, "check_state", return_value=True):
            outcome = grader.grade(
                _task(spec, TaskDifficulty.EXPERT), tracker, latest
            )

        assert not outcome.task_achieved

    def test_steps_give_partial_progress(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """Steps and state checks combine to full progress when all succeed."""
        required_steps = [
            StepCriteria(operation=op, resource="b")
            for op in ("create-bucket", "put-object")
        ]
        spec = SuccessCriteria(
            services=["s3"],
            state_checks=[StateCheck(command="cmd1", output_contains="ok")],
            steps=required_steps,
        )
        tracker.record_step("aws s3api create-bucket --bucket b", True, "", "")
        latest = tracker.record_step(
            "aws s3api put-object --bucket b --key k", True, "", ""
        )

        verifier = grader._verifier
        with patch.object(verifier, "check_state", return_value=True):
            outcome = grader.grade(
                _task(spec, TaskDifficulty.EXPERT), tracker, latest
            )

        assert outcome.task_achieved
        # Progress: 2/2 steps * 0.7 + 1/1 checks * 0.3 = 1.0
        assert outcome.partial_progress == 1.0

    def test_no_state_checks_not_achieved(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """An empty state_checks list never counts as an achieved task."""
        spec = SuccessCriteria(services=["s3"], state_checks=[])
        latest = tracker.record_step("aws s3 ls", True, "", "")

        # NOTE(review): which dispatch branch handles an empty state_checks
        # list is not pinned here; only the "not achieved" outcome is.
        outcome = grader.grade(_task(spec, TaskDifficulty.EXPERT), tracker, latest)

        assert not outcome.task_achieved
# ===================================================================
# _compute_reward (reward shaping)
# ===================================================================
class TestComputeReward:
    """Reward shaping applied to grading outcomes.

    The expected values below encode the shaping these tests pin: a 1.05
    multiplier when chaos occurred, a 0.85 factor per hint used, a 0.8
    weight on partial progress, a +0.1 bonus when progress advances and a
    0.1 penalty per detected rollback. Shaped (non-trivial) float rewards
    are compared with ``pytest.approx`` throughout.
    """

    def test_achieved_gives_1_0(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """An achieved task with no modifiers is rewarded exactly 1.0."""
        criteria = SuccessCriteria(command_contains="s3", operation="ls")
        step = _step("aws s3 ls")
        tracker.record_step(step.command, step.success, "", "")
        result = grader.grade(_task(criteria), tracker, step)
        assert result.reward == 1.0

    def test_chaos_bonus(self, grader: TaskGrader, tracker: EpisodeTracker) -> None:
        """Achieving the task while chaos occurred raises the reward to 1.05."""
        criteria = SuccessCriteria(command_contains="s3", operation="ls")
        step = _step("aws s3 ls")
        tracker.record_step(step.command, step.success, "", "")
        result = grader.grade(_task(criteria), tracker, step, chaos_occurred=True)
        # Compare approximately, like the other shaped-reward tests, so the
        # assertion does not depend on how the grader arrives at 1.05.
        assert result.reward == pytest.approx(1.05)

    def test_hint_decay_on_achieved(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """One hint scales an achieved reward by 0.85."""
        criteria = SuccessCriteria(command_contains="s3", operation="ls")
        step = _step("aws s3 ls")
        tracker.record_step(step.command, step.success, "", "")
        result = grader.grade(_task(criteria), tracker, step, hints_used=1)
        assert result.reward == pytest.approx(0.85)

    def test_hint_decay_on_achieved_stacks(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """Hint decay compounds: three hints scale the reward by 0.85**3."""
        criteria = SuccessCriteria(command_contains="s3", operation="ls")
        step = _step("aws s3 ls")
        tracker.record_step(step.command, step.success, "", "")
        result = grader.grade(_task(criteria), tracker, step, hints_used=3)
        assert result.reward == pytest.approx(0.85**3)

    def test_chaos_plus_hints(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """The chaos bonus and hint decay multiply together."""
        criteria = SuccessCriteria(command_contains="s3", operation="ls")
        step = _step("aws s3 ls")
        tracker.record_step(step.command, step.success, "", "")
        result = grader.grade(
            _task(criteria), tracker, step, chaos_occurred=True, hints_used=2
        )
        assert result.reward == pytest.approx(1.05 * 0.85**2)

    def test_failed_command_halves_reward(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """A failed, non-matching command with no progress is rewarded 0.0."""
        criteria = SuccessCriteria(command_contains="s3", operation="ls")
        step = _step("aws ec2 describe-instances", success=False)
        tracker.record_step(step.command, step.success, "", "")
        result = grader.grade(_task(criteria), tracker, step)
        # Not achieved, no progress, failed command => 0.0 * 0.5 = 0.0
        assert result.reward == 0.0

    def test_progress_bonus_for_advancing(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """Advancing progress adds a 0.1 bonus on top of the weighted progress."""
        criteria = SuccessCriteria(
            steps=[
                StepCriteria(operation="create-bucket", resource="b"),
                StepCriteria(operation="put-object", resource="b"),
            ]
        )
        # First step: progress goes from 0.0 to 0.5.
        step = tracker.record_step("aws s3api create-bucket --bucket b", True, "", "")
        result = grader.grade(
            _task(criteria, TaskDifficulty.INTERMEDIATE), tracker, step
        )
        # partial_progress=0.5, progress_delta > 0 => +0.1 bonus
        assert result.reward == pytest.approx(0.5 * 0.8 + 0.1)

    def test_no_bonus_for_same_progress(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """Repeating a step that does not advance progress earns no bonus."""
        criteria = SuccessCriteria(
            steps=[
                StepCriteria(operation="create-bucket", resource="b"),
                StepCriteria(operation="put-object", resource="b"),
            ]
        )
        step = tracker.record_step("aws s3api create-bucket --bucket b", True, "", "")
        # The first grade establishes the tracker's previous progress.
        grader.grade(_task(criteria, TaskDifficulty.INTERMEDIATE), tracker, step)
        # Grading the same command again cannot advance progress.
        step2 = tracker.record_step("aws s3api create-bucket --bucket b", True, "", "")
        result = grader.grade(
            _task(criteria, TaskDifficulty.INTERMEDIATE), tracker, step2
        )
        # No progress-delta bonus this time.
        assert result.reward == pytest.approx(0.5 * 0.8)

    def test_reward_clamped_below_1(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """A non-achieved grade never exceeds 0.99."""
        criteria = SuccessCriteria(command_contains="xyz", operation="nope")
        step = _step("aws s3 ls")
        tracker.record_step(step.command, step.success, "", "")
        result = grader.grade(_task(criteria), tracker, step)
        assert result.reward <= 0.99

    def test_rollback_penalty(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """Each detected create/delete rollback subtracts 0.1 from the reward."""
        criteria = SuccessCriteria(
            steps=[
                StepCriteria(operation="create-bucket", resource="b"),
                StepCriteria(operation="put-object", resource="b"),
            ]
        )
        # Create, delete (rollback), then create again.
        tracker.record_step("aws s3api create-bucket --bucket b", True, "", "")
        tracker.record_step("aws s3api delete-bucket --bucket b", True, "", "")
        step = tracker.record_step("aws s3api create-bucket --bucket b", True, "", "")
        result = grader.grade(
            _task(criteria, TaskDifficulty.INTERMEDIATE), tracker, step
        )
        # 2 rollbacks detected (both create-bucket commands pair with delete-bucket)
        base = 0.5 * 0.8 + 0.1  # progress + delta bonus
        expected = base - 0.1 * 2  # 2 rollback penalties
        assert result.reward == pytest.approx(expected)

    def test_idempotent_retry_bonus(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """A failed "already exists" create followed by a success never goes negative."""
        criteria = SuccessCriteria(
            steps=[
                StepCriteria(operation="create-bucket", resource="b"),
                StepCriteria(operation="put-object", resource="b"),
            ]
        )
        # create-bucket fails with an "already owned" error, then the next
        # step succeeds.
        tracker.record_step(
            "aws s3api create-bucket --bucket b", False, "", "BucketAlreadyOwnedByYou"
        )
        step = tracker.record_step(
            "aws s3api put-object --bucket b --key k", True, "", ""
        )
        result = grader.grade(
            _task(criteria, TaskDifficulty.INTERMEDIATE), tracker, step
        )
        # NOTE(review): because create-bucket failed, the ordered step walk
        # presumably stops at step 1 (progress 0/2), leaving only a small
        # idempotent-retry bonus, if any. The exact value is not pinned here;
        # this test only guards against a negative reward. Tighten the
        # assertion once the intended value is confirmed against the grader.
        assert result.reward >= 0.0
# ===================================================================
# Dispatch logic
# ===================================================================
class TestDispatch:
    """Which grading strategy is selected when several criteria are set."""

    def test_state_checks_takes_priority(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """With state_checks and steps both set, the state-check strategy is reported."""
        spec = SuccessCriteria(
            services=["s3"],
            state_checks=[StateCheck(command="cmd", output_contains="ok")],
            steps=[StepCriteria(operation="create-bucket", resource="b")],
        )
        latest = tracker.record_step(
            "aws s3api create-bucket --bucket b", True, "", ""
        )

        verifier = grader._verifier
        with patch.object(verifier, "check_state", return_value=True):
            outcome = grader.grade(
                _task(spec, TaskDifficulty.EXPERT), tracker, latest
            )

        assert "state_checks" in outcome.reason

    def test_steps_over_resource_exists(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """With steps and resource_exists both set, the multi-step strategy is reported."""
        spec = SuccessCriteria(
            steps=[StepCriteria(operation="create-bucket", resource="b")],
            resource_exists=ResourceExistsCheck(service="s3", name="b"),
        )
        latest = tracker.record_step(
            "aws s3api create-bucket --bucket b", True, "", ""
        )

        task = _task(spec, TaskDifficulty.INTERMEDIATE)
        outcome = grader.grade(task, tracker, latest)

        assert "multi_step" in outcome.reason

    def test_resource_exists_over_command_match(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """With resource_exists set, the resource-creation strategy is reported."""
        spec = SuccessCriteria(
            command_contains="s3api",
            operation="create-bucket",
            resource_exists=ResourceExistsCheck(service="s3", name="b"),
        )
        latest = _step("aws s3api create-bucket --bucket b")
        tracker.record_step(latest.command, latest.success, "", "")

        verifier = grader._verifier
        with patch.object(verifier, "resource_exists", return_value=True):
            outcome = grader.grade(
                _task(spec, TaskDifficulty.BEGINNER), tracker, latest
            )

        assert "resource_creation" in outcome.reason

    def test_no_criteria_gives_zero(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """Empty criteria are not achieved and the reason says so."""
        spec = SuccessCriteria()
        latest = _step("aws s3 ls")
        tracker.record_step(latest.command, latest.success, "", "")

        outcome = grader.grade(_task(spec), tracker, latest)

        assert not outcome.task_achieved
        assert "no recognised" in outcome.reason
# ===================================================================
# Progress monotonicity
# ===================================================================
class TestProgressMonotonicity:
    """The tracker's recorded progress must not go down between grades."""

    def test_previous_progress_never_decreases(
        self, grader: TaskGrader, tracker: EpisodeTracker
    ) -> None:
        """An unrelated follow-up command leaves previous_progress at 0.5."""
        required = [
            StepCriteria(operation=op, resource="b")
            for op in ("create-bucket", "put-object")
        ]
        spec = SuccessCriteria(steps=required)

        # Completing the first of two steps puts progress at 0.5.
        first = tracker.record_step(
            "aws s3api create-bucket --bucket b", True, "", ""
        )
        grader.grade(_task(spec, TaskDifficulty.INTERMEDIATE), tracker, first)
        assert tracker.previous_progress == 0.5

        # An unrelated command leaves step 2 incomplete, so progress is
        # still 0.5 on the second grade.
        second = tracker.record_step("aws sts get-caller-identity", True, "", "")
        grader.grade(_task(spec, TaskDifficulty.INTERMEDIATE), tracker, second)

        # The stored progress must not have dropped.
        assert tracker.previous_progress == 0.5