Cyber-Machine committed on
Commit
aea0016
·
verified ·
0 Parent(s):

init: WorkFlowArena

Browse files
.gitignore ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""WorkflowArena package exports."""

from workflow_arena.client import WorkflowArenaEnv
from workflow_arena.generator import generate_episode
from workflow_arena.models import (
    DifficultyPreset,
    EpisodeConfig,
    TaskStatus,
    WorkflowActionType,
    WorkflowArenaAction,
    WorkflowArenaObservation,
)
from workflow_arena.presets import PRESET_CONFIGS, get_preset_config
from workflow_arena.server.workflow_arena_environment import WorkflowArenaEnvironment

# Public names re-exported at package level.
__all__ = [
    "DifficultyPreset",
    "EpisodeConfig",
    "PRESET_CONFIGS",
    "TaskStatus",
    "WorkflowActionType",
    "WorkflowArenaAction",
    "WorkflowArenaEnv",
    "WorkflowArenaEnvironment",
    "WorkflowArenaObservation",
    "generate_episode",
    "get_preset_config",
]
client.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """WorkflowArena client."""
8
+
9
+ from typing import Dict
10
+
11
+ from openenv.core import EnvClient
12
+ from openenv.core.client_types import StepResult
13
+ from openenv.core.env_server.types import State
14
+
15
+ from workflow_arena.models import WorkflowArenaAction, WorkflowArenaObservation
16
+
17
+
18
class WorkflowArenaEnv(
    EnvClient[WorkflowArenaAction, WorkflowArenaObservation, State]
):
    """Typed client for the WorkflowArena server."""

    def _step_payload(self, action: WorkflowArenaAction) -> Dict:
        """Serialize a typed action into the JSON body sent to the server."""
        return action.model_dump(mode="json")

    def _parse_result(self, payload: Dict) -> StepResult[WorkflowArenaObservation]:
        """Parse a raw server response into a typed StepResult."""
        done = payload.get("done", False)
        reward = payload.get("reward")

        # Fold the top-level done/reward flags into the observation payload
        # so the observation model carries them as well.
        merged = dict(payload.get("observation", {}))
        merged["done"] = done
        merged["reward"] = reward
        observation = WorkflowArenaObservation.model_validate(merged)

        return StepResult(observation=observation, reward=reward, done=done)

    def _parse_state(self, payload: Dict) -> State:
        """Parse a raw state response into the generic OpenEnv state type."""
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )
generator.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Seeded workflow DAG generator and derived static metrics for WorkflowArena."""
8
+
9
+ from __future__ import annotations
10
+
11
+ import random
12
+
13
+ from workflow_arena.models import (
14
+ EpisodeConfig,
15
+ TaskStatus,
16
+ WorkflowEnvStateSnapshot,
17
+ WorkflowEpisodeSpec,
18
+ WorkflowTaskSpec,
19
+ )
20
+ from workflow_arena.presets import get_preset_config
21
+
22
+
23
+ def _task_id(index: int) -> str:
24
+ return f"task_{index:02d}"
25
+
26
+
27
+ def _compute_earliest_start(task_map: dict[str, WorkflowTaskSpec], task_id: str) -> int:
28
+ task = task_map[task_id]
29
+ if not task.dependencies:
30
+ return 0
31
+ return max(
32
+ _compute_earliest_start(task_map, dep_id) + task_map[dep_id].duration
33
+ for dep_id in task.dependencies
34
+ )
35
+
36
+
37
+ def _compute_critical_path(task_map: dict[str, WorkflowTaskSpec], task_id: str) -> int:
38
+ task = task_map[task_id]
39
+ if not task.dependents:
40
+ return task.duration
41
+ return task.duration + max(
42
+ _compute_critical_path(task_map, child_id) for child_id in task.dependents
43
+ )
44
+
45
+
46
+ def _compute_downstream_count(
47
+ task_map: dict[str, WorkflowTaskSpec], task_id: str, seen: set[str] | None = None
48
+ ) -> int:
49
+ task = task_map[task_id]
50
+ local_seen = set() if seen is None else seen
51
+ count = 0
52
+ for child_id in task.dependents:
53
+ if child_id in local_seen:
54
+ continue
55
+ local_seen.add(child_id)
56
+ count += 1 + _compute_downstream_count(task_map, child_id, local_seen)
57
+ return count
58
+
59
+
60
+ def _estimate_deadline(
61
+ task: WorkflowTaskSpec,
62
+ workflow_critical_path: int,
63
+ rng: random.Random,
64
+ tightness: float,
65
+ ) -> int:
66
+ slack_allowance = max(1, int(round((workflow_critical_path - task.earliest_start) * (1.15 - tightness))))
67
+ jitter = rng.randint(0, max(1, task.duration // 2))
68
+ return task.earliest_start + task.duration + slack_allowance + jitter
69
+
70
+
71
def generate_episode(
    config: EpisodeConfig,
) -> tuple[WorkflowEpisodeSpec, WorkflowEnvStateSnapshot]:
    """Generate a deterministic workflow episode from a preset and seed.

    Returns the static episode spec (tasks with derived DAG metrics) and the
    initial environment state snapshot at time 0.
    """

    preset_config = get_preset_config(config.preset)
    # An explicit worker_count on the config wins over the preset default.
    worker_count = config.worker_count or preset_config.worker_count
    resolved_config = config.model_copy(update={"worker_count": worker_count})
    # Single seeded RNG; every draw below consumes it in a fixed order, which
    # is what makes the episode deterministic per (preset, seed).
    rng = random.Random(resolved_config.seed)
    task_count = rng.randint(preset_config.min_tasks, preset_config.max_tasks)

    dependency_map: dict[str, list[str]] = {}
    dependent_map: dict[str, list[str]] = {}
    task_ids = [_task_id(index + 1) for index in range(task_count)]

    # Edges only ever point from earlier task ids to later ones, so the
    # generated graph is a DAG by construction.
    for index, task_id in enumerate(task_ids):
        candidates = task_ids[:index]
        dependencies: list[str] = []
        if candidates:
            for candidate in candidates:
                if rng.random() < preset_config.edge_probability:
                    dependencies.append(candidate)
            # Bias toward connectivity: a non-root task with no sampled edges
            # still gets one dependency 60% of the time.
            if not dependencies and index > 0 and rng.random() < 0.6:
                dependencies.append(rng.choice(candidates))
        # De-duplicate while preserving creation order of the task ids.
        dependency_map[task_id] = sorted(set(dependencies), key=task_ids.index)
        dependent_map[task_id] = []

    # Invert the dependency map to get downstream edges.
    for task_id, dependencies in dependency_map.items():
        for dependency in dependencies:
            dependent_map[dependency].append(task_id)

    tasks = [
        WorkflowTaskSpec(
            task_id=task_id,
            duration=rng.randint(preset_config.duration_min, preset_config.duration_max),
            priority=rng.randint(preset_config.priority_min, preset_config.priority_max),
            dependencies=dependency_map[task_id],
            dependents=sorted(dependent_map[task_id], key=task_ids.index),
            deadline=None,  # filled in below once critical-path metrics exist
        )
        for task_id in task_ids
    ]

    task_map = {task.task_id: task for task in tasks}

    # Derive static DAG metrics used for grading and deadline estimation.
    workflow_critical_path = 0
    for task in tasks:
        task.earliest_start = _compute_earliest_start(task_map, task.task_id)
        task.critical_path_length = _compute_critical_path(task_map, task.task_id)
        task.downstream_count = _compute_downstream_count(task_map, task.task_id)
        workflow_critical_path = max(
            workflow_critical_path, task.earliest_start + task.duration
        )

    workflow_critical_path = max(
        workflow_critical_path,
        max(task.critical_path_length for task in tasks),
    )

    max_downstream = max(task.downstream_count for task in tasks) if tasks else 1
    max_critical_path = max(task.critical_path_length for task in tasks) if tasks else 1

    for task in tasks:
        latest_start = max(
            task.earliest_start, workflow_critical_path - task.critical_path_length
        )
        task.slack = max(0, latest_start - task.earliest_start)
        # Weighted blend of normalized path length and fan-out.
        task.criticality = round(
            0.7 * (task.critical_path_length / max_critical_path)
            + 0.3 * (task.downstream_count / max(1, max_downstream)),
            4,
        )
        task.deadline = _estimate_deadline(
            task=task,
            workflow_critical_path=workflow_critical_path,
            rng=rng,
            tightness=preset_config.deadline_tightness,
        )

    episode = WorkflowEpisodeSpec(
        config=resolved_config,
        preset_config=preset_config,
        tasks=tasks,
    )

    ready_task_ids = [task.task_id for task in tasks if not task.dependencies]
    blocked_task_ids = [task.task_id for task in tasks if task.dependencies]

    # Initial snapshot: clock at zero, nothing running or completed yet.
    state = WorkflowEnvStateSnapshot(
        episode_id=f"seed-{resolved_config.seed}",
        current_time=0,
        task_statuses={
            task.task_id: (
                TaskStatus.READY if not task.dependencies else TaskStatus.BLOCKED
            )
            for task in tasks
        },
        running_task_ids=[],
        completed_task_ids=[],
        ready_task_ids=ready_task_ids,
        blocked_task_ids=blocked_task_ids,
        task_start_times={},
        task_end_times={},
        task_remaining_dependencies={
            task.task_id: len(task.dependencies) for task in tasks
        },
        task_assigned_finish_times={},
        task_attempt_counts={task.task_id: 0 for task in tasks},
        cumulative_busy_time=0,
        time_budget=None,
        degraded_workers=0,
        active_worker_outage_until=None,
        recent_failure_events=[],
    )
    return episode, state
models.py ADDED
@@ -0,0 +1,483 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Typed models for WorkflowArena.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from enum import Enum
14
+
15
+ from openenv.core.env_server.types import Action, Observation
16
+ from pydantic import BaseModel, Field
17
+
18
+
19
class TaskStatus(str, Enum):
    """Allowed lifecycle states for a workflow task."""

    # Roughly ordered by lifecycle: blocked -> ready -> running -> completed.
    BLOCKED = "blocked"
    READY = "ready"
    RUNNING = "running"
    COMPLETED = "completed"


class DifficultyPreset(str, Enum):
    """Initial task presets required by the hackathon."""

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"


class WorkflowActionType(str, Enum):
    """Explicit action space for the scheduler agent."""

    # Dispatch one or more ready tasks; wait advances to the next event.
    DISPATCH = "dispatch"
    WAIT = "wait"
41
+
42
+
43
class RewardBreakdown(BaseModel):
    """Named reward channels for shaped feedback.

    All channels default to 0.0, so a fresh instance represents a no-op step.
    """

    completion_reward: float = Field(
        default=0.0, description="Reward for completing tasks."
    )
    utilization_reward: float = Field(
        default=0.0, description="Reward for keeping workers busy."
    )
    deadline_reward: float = Field(
        default=0.0, description="Reward or penalty tied to deadlines."
    )
    criticality_reward: float = Field(
        default=0.0,
        description="Reward for prioritizing critical-path work appropriately.",
    )
    idle_penalty: float = Field(
        default=0.0, description="Penalty for leaving workers idle."
    )
    invalid_action_penalty: float = Field(
        default=0.0,
        description="Penalty for malformed or infeasible actions.",
    )
    terminal_makespan_score: float = Field(
        default=0.0,
        description="Terminal score based on final schedule quality.",
    )
    unfinished_task_penalty: float = Field(
        default=0.0,
        description="Terminal penalty for unfinished work at episode end.",
    )


class FailureEventType(str, Enum):
    """Failure events surfaced to agents and the UI."""

    WORKER_OUTAGE_START = "worker_outage_start"
    WORKER_OUTAGE_END = "worker_outage_end"
    TASK_RETRY_FAILURE = "task_retry_failure"


class WorkflowFailureEvent(BaseModel):
    """Structured failure event emitted by the environment."""

    event_type: FailureEventType = Field(..., description="Failure category.")
    time: int = Field(..., ge=0, description="Simulated time when the event was observed.")
    task_id: str | None = Field(default=None, description="Task affected by the event, if any.")
    worker_delta: int = Field(default=0, description="Net temporary change in usable workers.")
    duration: int | None = Field(default=None, ge=0, description="Outage duration when applicable.")
    detail: str = Field(default="", description="Short human-readable summary.")
93
+
94
+
95
class WorkflowTaskView(BaseModel):
    """Compact task payload used in observations and the future UI."""

    task_id: str = Field(..., description="Stable task identifier.")
    status: TaskStatus = Field(..., description="Current task lifecycle state.")
    duration: int = Field(
        ..., ge=1, description="Task runtime in simulated time units."
    )
    priority: int = Field(..., ge=0, description="Priority weight for the task.")
    dependencies: list[str] = Field(
        default_factory=list,
        description="Upstream task ids that must complete first.",
    )
    deadline: int | None = Field(
        default=None,
        ge=0,
        description="Optional deadline in simulated time units.",
    )
    criticality: float | None = Field(
        default=None,
        description="Derived importance score from the DAG structure.",
    )
    slack: float | None = Field(
        default=None,
        description="Derived slack estimate for scheduling decisions.",
    )
    downstream_count: int = Field(
        default=0,
        ge=0,
        description="Count of downstream dependents reachable from this task.",
    )
    start_time: int | None = Field(
        default=None,
        ge=0,
        description="Simulated start time if the task is running or completed.",
    )
    end_time: int | None = Field(
        default=None,
        ge=0,
        description="Simulated end time if the task is completed or scheduled to finish.",
    )
    attempt_count: int = Field(
        default=0,
        ge=0,
        description="Number of retry attempts already consumed by this task.",
    )


class WorkflowTaskSpec(BaseModel):
    """Static task specification generated at episode reset.

    The derived metric fields (downstream_count, critical_path_length,
    earliest_start, slack, criticality, deadline) are filled in by the
    generator after the DAG is built.
    """

    task_id: str = Field(..., description="Stable task identifier.")
    duration: int = Field(..., ge=1, description="Task runtime in simulated time units.")
    priority: int = Field(..., ge=0, description="Priority weight for the task.")
    dependencies: list[str] = Field(
        default_factory=list,
        description="Upstream task ids that must complete first.",
    )
    dependents: list[str] = Field(
        default_factory=list,
        description="Downstream task ids that depend on this task.",
    )
    deadline: int | None = Field(
        default=None,
        ge=0,
        description="Optional deadline in simulated time units.",
    )
    downstream_count: int = Field(
        default=0,
        ge=0,
        description="Number of downstream tasks reachable from this node.",
    )
    critical_path_length: int = Field(
        default=0,
        ge=0,
        description="Duration-weighted path length from this task to a sink.",
    )
    earliest_start: int = Field(
        default=0,
        ge=0,
        description="Earliest feasible start time under dependency constraints.",
    )
    slack: int = Field(
        default=0,
        ge=0,
        description="Scheduling slack measured in simulated time units.",
    )
    criticality: float = Field(
        default=0.0,
        description="Normalized importance score derived from critical path and downstream impact.",
    )
186
+
187
+
188
class ProgressSummary(BaseModel):
    """Counts by task lifecycle state."""

    total: int = Field(default=0, ge=0)
    blocked: int = Field(default=0, ge=0)
    ready: int = Field(default=0, ge=0)
    running: int = Field(default=0, ge=0)
    completed: int = Field(default=0, ge=0)


class EpisodeConfig(BaseModel):
    """Reset-time knobs that define the episode."""

    preset: DifficultyPreset = Field(
        default=DifficultyPreset.EASY,
        description="Difficulty preset for the episode generator.",
    )
    seed: int = Field(
        default=0, description="Seed for deterministic episode generation."
    )
    worker_count: int = Field(
        default=2,
        ge=1,
        description="Number of identical workers available to the scheduler.",
    )


class GraderTarget(BaseModel):
    """High-level target bands for each preset's grader."""

    description: str = Field(..., description="What good performance means for the preset.")
    score_band_hint: str = Field(..., description="Human-readable interpretation of scores.")


class DifficultyPresetConfig(BaseModel):
    """Concrete generator knobs for a preset.

    NOTE(review): no validator enforces min_tasks <= max_tasks or
    duration_min <= duration_max — confirm preset definitions keep these
    ordered, since random.Random.randint raises otherwise.
    """

    preset: DifficultyPreset = Field(..., description="Preset identifier.")
    min_tasks: int = Field(..., ge=2)
    max_tasks: int = Field(..., ge=2)
    edge_probability: float = Field(..., ge=0.0, le=1.0)
    duration_min: int = Field(..., ge=1)
    duration_max: int = Field(..., ge=1)
    priority_min: int = Field(..., ge=0)
    priority_max: int = Field(..., ge=0)
    worker_count: int = Field(..., ge=1)
    deadline_tightness: float = Field(
        ...,
        ge=0.0,
        description="Larger values mean tighter deadlines.",
    )
    time_budget_multiplier: float | None = Field(
        default=None,
        gt=0.0,
        description="Optional multiplier over the theoretical lower-bound makespan.",
    )
    worker_outage_rate: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Chance of a hard-mode worker outage being sampled on a wait transition.",
    )
    worker_outage_duration_min: int = Field(
        default=0,
        ge=0,
        description="Minimum outage duration in simulated time units.",
    )
    worker_outage_duration_max: int = Field(
        default=0,
        ge=0,
        description="Maximum outage duration in simulated time units.",
    )
    task_retry_failure_rate: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Chance that a hard-mode task completion becomes a retry failure.",
    )
    max_task_retries: int = Field(
        default=0,
        ge=0,
        description="Maximum number of retry failures a task may suffer before it must complete.",
    )
    grader_target: GraderTarget = Field(
        ...,
        description="Preset-specific grader interpretation.",
    )
275
+
276
+
277
class WorkflowEpisodeSpec(BaseModel):
    """Static episode description produced by the generator."""

    config: EpisodeConfig = Field(..., description="Reset-time configuration.")
    preset_config: DifficultyPresetConfig = Field(..., description="Resolved preset parameters.")
    tasks: list[WorkflowTaskSpec] = Field(..., description="Generated workflow tasks.")


class WorkflowEnvStateSnapshot(BaseModel):
    """Serializable environment state for the current episode."""

    episode_id: str = Field(..., description="Stable current episode identifier.")
    current_time: int = Field(default=0, ge=0, description="Current simulated time.")
    task_statuses: dict[str, TaskStatus] = Field(
        default_factory=dict,
        description="Current task status by task id.",
    )
    running_task_ids: list[str] = Field(
        default_factory=list,
        description="Tasks currently consuming workers.",
    )
    completed_task_ids: list[str] = Field(
        default_factory=list,
        description="Tasks that have completed.",
    )
    ready_task_ids: list[str] = Field(
        default_factory=list,
        description="Tasks currently ready for dispatch.",
    )
    blocked_task_ids: list[str] = Field(
        default_factory=list,
        description="Tasks still blocked on dependencies.",
    )
    task_start_times: dict[str, int] = Field(
        default_factory=dict,
        description="Simulated start time by task id.",
    )
    task_end_times: dict[str, int] = Field(
        default_factory=dict,
        description="Simulated completion time by task id.",
    )
    task_remaining_dependencies: dict[str, int] = Field(
        default_factory=dict,
        description="Remaining unfinished prerequisites by task id.",
    )
    task_assigned_finish_times: dict[str, int] = Field(
        default_factory=dict,
        description="Predicted completion times for currently running tasks.",
    )
    task_attempt_counts: dict[str, int] = Field(
        default_factory=dict,
        description="Retry attempts consumed by each task.",
    )
    cumulative_busy_time: int = Field(
        default=0,
        ge=0,
        description="Aggregate worker busy time accrued so far.",
    )
    time_budget: int | None = Field(
        default=None,
        ge=0,
        description="Optional terminal time budget for the episode.",
    )
    degraded_workers: int = Field(
        default=0,
        ge=0,
        description="Workers temporarily removed from usable capacity.",
    )
    active_worker_outage_until: int | None = Field(
        default=None,
        ge=0,
        description="Time when the current worker outage expires, if any.",
    )
    recent_failure_events: list[WorkflowFailureEvent] = Field(
        default_factory=list,
        description="Failure events generated on the latest transition.",
    )


class SuccessMetrics(BaseModel):
    """Primary quality metrics used for grading and demos."""

    makespan: int | None = Field(
        default=None, description="Total simulated completion time."
    )
    worker_utilization: float | None = Field(
        default=None,
        description="Fraction of available worker time that was used.",
    )
    deadline_miss_count: int = Field(
        default=0, ge=0, description="Missed task deadlines."
    )
    unfinished_task_count: int = Field(
        default=0, ge=0, description="Tasks left incomplete at terminal time."
    )
    weighted_priority_completion: float | None = Field(
        default=None,
        description="Priority-weighted on-time completion score.",
    )
    benchmark_score: float | None = Field(
        default=None,
        description="Deterministic terminal benchmark score in the 0.0-1.0 range.",
    )
380
+
381
+
382
class WorkflowArenaAction(Action):
    """Strict action space for the workflow scheduler."""

    action_type: WorkflowActionType = Field(
        ...,
        description="Dispatch ready tasks or wait for the next completion event.",
    )
    task_ids: list[str] = Field(
        default_factory=list,
        description="Task ids to dispatch. Must be empty for wait().",
    )


class WorkflowArenaObservation(Observation):
    """Compact, typed observation contract for WorkflowArena."""

    instruction: str = Field(
        default=(
            "Schedule dependency-constrained workflow tasks on limited workers using "
            "dispatch(task_ids=[...]) or wait()."
        ),
        description="Short prompt shown to inference agents.",
    )
    config: EpisodeConfig = Field(
        default_factory=EpisodeConfig,
        description="Episode generation settings.",
    )
    current_time: int = Field(default=0, ge=0, description="Current simulated time.")
    total_workers: int = Field(default=2, ge=1, description="Total identical workers.")
    effective_workers: int = Field(
        default=2,
        ge=0,
        description="Usable workers after temporary degradation is applied.",
    )
    degraded_workers: int = Field(
        default=0,
        ge=0,
        description="Workers currently unavailable due to outages.",
    )
    free_workers: int = Field(default=2, ge=0, description="Currently idle workers.")
    time_budget: int | None = Field(
        default=None,
        ge=0,
        description="Optional terminal time budget for the current episode.",
    )
    time_remaining: int | None = Field(
        default=None,
        description="Remaining time until the episode budget expires, if budgeted.",
    )
    progress: ProgressSummary = Field(
        default_factory=ProgressSummary,
        description="Task counts by lifecycle state.",
    )
    ready_tasks: list[WorkflowTaskView] = Field(
        default_factory=list,
        description="Ready tasks eligible for dispatch.",
    )
    running_tasks: list[WorkflowTaskView] = Field(
        default_factory=list,
        description="Tasks currently consuming workers.",
    )
    completed_tasks: list[WorkflowTaskView] = Field(
        default_factory=list,
        description="Tasks already completed.",
    )
    blocked_tasks: list[WorkflowTaskView] = Field(
        default_factory=list,
        description="Tasks still waiting on dependencies.",
    )
    last_reward_breakdown: RewardBreakdown = Field(
        default_factory=RewardBreakdown,
        description="Per-step reward channel breakdown.",
    )
    cumulative_reward: float = Field(default=0.0, description="Running total reward.")
    success_metrics: SuccessMetrics = Field(
        default_factory=SuccessMetrics,
        description="Primary schedule quality metrics.",
    )
    note: str | None = Field(
        default=None,
        description="Short environment note about the latest transition.",
    )
    validation_error: str | None = Field(
        default=None,
        description="Explicit invalid-action explanation when the previous action failed.",
    )
    termination_reason: str | None = Field(
        default=None,
        description="Terminal reason when the episode ended unsuccessfully.",
    )
    benchmark_score: float | None = Field(
        default=None,
        description="Top-level bounded benchmark score for easier client access.",
    )
    recent_failure_events: list[WorkflowFailureEvent] = Field(
        default_factory=list,
        description="Failure events generated on the latest accepted transition.",
    )
    received_action: dict[str, object] | None = Field(
        default=None,
        description="Last action accepted by the server for logging and prompting.",
    )
openenv.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: workflow_arena
3
+ type: space
4
+ runtime: fastapi
5
+ app: server.app:app
6
+ port: 8000
7
+
presets.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Difficulty presets for WorkflowArena."""
8
+
9
+ from __future__ import annotations
10
+
11
+ from workflow_arena.models import DifficultyPreset, DifficultyPresetConfig, GraderTarget
12
+
13
+
14
# Registry of generator knobs per difficulty level. Access through
# get_preset_config() to receive a defensive copy.
PRESET_CONFIGS: dict[DifficultyPreset, DifficultyPresetConfig] = {
    # Small, sparse DAG with loose deadlines and no failure modes.
    DifficultyPreset.EASY: DifficultyPresetConfig(
        preset=DifficultyPreset.EASY,
        min_tasks=8,
        max_tasks=12,
        edge_probability=0.14,
        duration_min=1,
        duration_max=4,
        priority_min=1,
        priority_max=4,
        worker_count=3,
        deadline_tightness=0.22,
        time_budget_multiplier=None,
        worker_outage_rate=0.0,
        worker_outage_duration_min=0,
        worker_outage_duration_max=0,
        task_retry_failure_rate=0.0,
        max_task_retries=0,
        grader_target=GraderTarget(
            description=(
                "Reward agents that keep workers utilized and avoid obvious idle time on a "
                "small, low-pressure workflow."
            ),
            score_band_hint="0.8+ means near-greedy scheduling, 0.5 is acceptable, below 0.3 is weak.",
        ),
    ),
    # Larger, denser DAG with a time budget and tighter deadlines.
    DifficultyPreset.MEDIUM: DifficultyPresetConfig(
        preset=DifficultyPreset.MEDIUM,
        min_tasks=12,
        max_tasks=18,
        edge_probability=0.22,
        duration_min=1,
        duration_max=6,
        priority_min=1,
        priority_max=6,
        worker_count=4,
        deadline_tightness=0.40,
        time_budget_multiplier=1.6,
        worker_outage_rate=0.0,
        worker_outage_duration_min=0,
        worker_outage_duration_max=0,
        task_retry_failure_rate=0.0,
        max_task_retries=0,
        grader_target=GraderTarget(
            description=(
                "Reward agents that balance utilization, deadline adherence, and critical-path "
                "awareness on a moderately branching workflow."
            ),
            score_band_hint="0.75+ is strong, 0.45 to 0.75 is competitive, below 0.3 misses core tradeoffs.",
        ),
    ),
    # Dense DAG, fewer workers, plus worker outages and retry failures.
    DifficultyPreset.HARD: DifficultyPresetConfig(
        preset=DifficultyPreset.HARD,
        min_tasks=22,
        max_tasks=36,
        edge_probability=0.37,
        duration_min=2,
        duration_max=9,
        priority_min=1,
        priority_max=8,
        worker_count=2,
        deadline_tightness=0.78,
        time_budget_multiplier=1.45,
        worker_outage_rate=0.2,
        worker_outage_duration_min=2,
        worker_outage_duration_max=4,
        task_retry_failure_rate=0.12,
        max_task_retries=1,
        grader_target=GraderTarget(
            description=(
                "Reward agents that identify and schedule long-running critical tasks early while "
                "protecting high-priority deadlines under frequent worker-capacity bottlenecks."
            ),
            score_band_hint="0.7+ is excellent, 0.4 to 0.7 is competent, below 0.25 is poor planning.",
        ),
    ),
}
91
+
92
+
93
def get_preset_config(preset: DifficultyPreset) -> DifficultyPresetConfig:
    """Return the immutable config for a preset.

    A deep copy is handed out so callers can mutate the result without
    affecting the shared PRESET_CONFIGS registry.
    """
    base_config = PRESET_CONFIGS[preset]
    return base_config.model_copy(deep=True)
pyproject.toml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-workflow_arena"
13
+ version = "0.1.0"
14
+ description = "Workflow Arena environment for OpenEnv"
15
+ requires-python = ">=3.10"
16
+ dependencies = [
17
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
+ # install from github
19
+ # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
+ "openenv-core[core]>=0.2.2",
21
+ "gradio>=5.0.0",
22
+ "plotly>=5.24.0",
23
+ # Environment-specific dependencies
24
+ # Add all dependencies needed for your environment here
25
+ # Examples:
26
+ # "numpy>=1.19.0",
27
+ # "torch>=2.0.0",
28
+ # "gymnasium>=0.29.0",
29
+ # "openspiel>=1.0.0",
30
+ # "smolagents>=1.22.0,<2",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ dev = [
35
+ "pytest>=8.0.0",
36
+ "pytest-cov>=4.0.0",
37
+ ]
38
+
39
+ [project.scripts]
40
+ # Server entry point - enables running via: uv run --project . server
41
+ # or: python -m workflow_arena.server.app
42
+ server = "workflow_arena.server.app:main"
43
+
44
+ [tool.setuptools]
45
+ include-package-data = true
46
+ packages = ["workflow_arena", "workflow_arena.server"]
47
+ package-dir = { "workflow_arena" = ".", "workflow_arena.server" = "server" }
server/Dockerfile ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Multi-stage build using openenv-base
8
+ # This Dockerfile is flexible and works for both:
9
+ # - In-repo environments (with local OpenEnv sources)
10
+ # - Standalone environments (with openenv from PyPI/Git)
11
+ # The build script (openenv build) handles context detection and sets appropriate build args.
12
+
13
+ ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
14
+ FROM ${BASE_IMAGE} AS builder
15
+
16
+ WORKDIR /app
17
+
18
+ # Ensure git is available (required for installing dependencies from VCS)
19
+ RUN apt-get update && \
20
+ apt-get install -y --no-install-recommends git && \
21
+ rm -rf /var/lib/apt/lists/*
22
+
23
+ # Build argument to control whether we're building standalone or in-repo
24
+ ARG BUILD_MODE=in-repo
25
+ ARG ENV_NAME=workflow_arena
26
+
27
+ # Copy environment code (always at root of build context)
28
+ COPY . /app/env
29
+
30
+ # For in-repo builds, openenv is already vendored in the build context
31
+ # For standalone builds, openenv will be installed via pyproject.toml
32
+ WORKDIR /app/env
33
+
34
+ # Ensure uv is available (for local builds where base image lacks it)
35
+ RUN if ! command -v uv >/dev/null 2>&1; then \
36
+ curl -LsSf https://astral.sh/uv/install.sh | sh && \
37
+ mv /root/.local/bin/uv /usr/local/bin/uv && \
38
+ mv /root/.local/bin/uvx /usr/local/bin/uvx; \
39
+ fi
40
+
41
+ # Install dependencies using uv sync
42
+ # If uv.lock exists, use it; otherwise resolve on the fly
43
+ RUN --mount=type=cache,target=/root/.cache/uv \
44
+ if [ -f uv.lock ]; then \
45
+ uv sync --frozen --no-install-project --no-editable; \
46
+ else \
47
+ uv sync --no-install-project --no-editable; \
48
+ fi
49
+
50
+ RUN --mount=type=cache,target=/root/.cache/uv \
51
+ if [ -f uv.lock ]; then \
52
+ uv sync --frozen --no-editable; \
53
+ else \
54
+ uv sync --no-editable; \
55
+ fi
56
+
57
+ # Final runtime stage
58
+ FROM ${BASE_IMAGE}
59
+
60
+ WORKDIR /app
61
+
62
+ # Copy the virtual environment from builder
63
+ COPY --from=builder /app/env/.venv /app/.venv
64
+
65
+ # Copy the environment code
66
+ COPY --from=builder /app/env /app/env
67
+
68
+ # Set PATH to use the virtual environment
69
+ ENV PATH="/app/.venv/bin:$PATH"
70
+
71
+ # Set PYTHONPATH so imports work correctly
72
+ ENV PYTHONPATH="/app/env:$PYTHONPATH"
73
+
74
+ # Health check
75
+ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
76
+ CMD curl -f http://localhost:8000/health || exit 1
77
+
78
+ # Run the FastAPI server
79
+ # The module path is constructed to work with the /app/env structure
80
+ CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
server/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Workflow Arena environment server components."""
8
+
9
+ from workflow_arena.server.workflow_arena_environment import WorkflowArenaEnvironment
10
+
11
+ __all__ = ["WorkflowArenaEnvironment"]
server/app.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ FastAPI application for the Workflow Arena Environment.
9
+
10
+ This module creates an HTTP server that exposes the WorkflowArenaEnvironment
11
+ over HTTP and WebSocket endpoints, compatible with EnvClient.
12
+
13
+ Endpoints:
14
+ - POST /reset: Reset the environment
15
+ - POST /step: Execute an action
16
+ - GET /state: Get current environment state
17
+ - GET /schema: Get action/observation schemas
18
+ - WS /ws: WebSocket endpoint for persistent sessions
19
+
20
+ Usage:
21
+ # Development (with auto-reload):
22
+ uvicorn server.app:app --reload --host 0.0.0.0 --port 8000
23
+
24
+ # Production:
25
+ uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers 4
26
+
27
+ # Or run directly:
28
+ python -m server.app
29
+ """
30
+
31
import gradio as gr

# openenv supplies the FastAPI wrapper (create_app); fail fast with an
# actionable message when it is missing.
try:
    from openenv.core.env_server.http_server import create_app
except Exception as e:  # pragma: no cover
    raise ImportError(
        "openenv is required for the web interface. Install dependencies with '\n uv sync\n'"
    ) from e

from workflow_arena.models import WorkflowArenaAction, WorkflowArenaObservation
from workflow_arena.server.ui import create_gradio_app
from workflow_arena.server.workflow_arena_environment import WorkflowArenaEnvironment


# Create the app with web interface and README integration.
# create_app wires the environment class plus its action/observation
# schemas into the standard OpenEnv HTTP/WebSocket endpoints.
app = create_app(
    WorkflowArenaEnvironment,
    WorkflowArenaAction,
    WorkflowArenaObservation,
    env_name="workflow_arena",
    max_concurrent_envs=1,  # increase this number to allow more concurrent WebSocket sessions
)

# Mount Gradio UI at root — MUST be after all API routes to avoid catchall interference
_gradio_app = create_gradio_app()
app = gr.mount_gradio_app(app, _gradio_app, path="/")
57
+
58
+
59
+ def main(host: str = "0.0.0.0", port: int = 8000):
60
+ """
61
+ Entry point for direct execution via uv run or python -m.
62
+
63
+ This function enables running the server without Docker:
64
+ uv run --project . server
65
+ uv run --project . server --port 8001
66
+ python -m workflow_arena.server.app
67
+
68
+ Args:
69
+ host: Host address to bind to (default: "0.0.0.0")
70
+ port: Port number to listen on (default: 8000)
71
+
72
+ For production deployments, consider using uvicorn directly with
73
+ multiple workers:
74
+ uvicorn workflow_arena.server.app:app --workers 4
75
+ """
76
+ import uvicorn
77
+
78
+ uvicorn.run(app, host=host, port=port)
79
+
80
+
81
+ if __name__ == "__main__":
82
+ import argparse
83
+
84
+ parser = argparse.ArgumentParser()
85
+ parser.add_argument("--port", type=int, default=8000)
86
+ args = parser.parse_args()
87
+ if args.port == 8000:
88
+ main()
89
+ else:
90
+ main(port=args.port)
server/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ openenv-core[core]>=0.2.2
2
+ fastapi>=0.115.0
3
+ uvicorn>=0.24.0
4
+ gradio>=5.0.0
5
+ plotly>=5.24.0
6
+
server/ui.py ADDED
@@ -0,0 +1,1270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Interactive Gradio UI for WorkflowArena."""
8
+
9
+ from __future__ import annotations
10
+
11
+ import random
12
+ from types import SimpleNamespace
13
+ from typing import Any
14
+
15
+ import gradio as gr
16
+ import plotly.graph_objects as go
17
+
18
+ from workflow_arena.models import DifficultyPreset, TaskStatus, WorkflowActionType, WorkflowArenaAction
19
+ from workflow_arena.presets import get_preset_config
20
+ from workflow_arena.server.workflow_arena_environment import WorkflowArenaEnvironment
21
+
22
+
23
+ Session = dict[str, Any]
24
+
25
+ DETAIL_HEADERS = [
26
+ "Task",
27
+ "Priority",
28
+ "Duration",
29
+ "Deadline",
30
+ "Criticality",
31
+ "Slack",
32
+ "Deps",
33
+ "Downstream",
34
+ "Attempts",
35
+ "Start",
36
+ "End",
37
+ ]
38
+
39
+ PRESET_BRIEFS = {
40
+ DifficultyPreset.EASY.value: {
41
+ "label": "Warm-up Flow",
42
+ "summary": "Small DAG, softer deadlines, and fewer traps. Good for learning how dispatch and wait interact.",
43
+ "focus": "Keep workers busy, avoid empty waits, and build intuition for parallel batches.",
44
+ "mechanics": "No hard time budget and no failure events.",
45
+ },
46
+ DifficultyPreset.MEDIUM.value: {
47
+ "label": "Balanced Pressure",
48
+ "summary": "Tighter dependencies and more timing pressure. Scheduling mistakes start to compound.",
49
+ "focus": "Balance urgency, downstream unlocks, and worker utilization.",
50
+ "mechanics": "Adds a fixed time budget and terminal penalty for unfinished work.",
51
+ },
52
+ DifficultyPreset.HARD.value: {
53
+ "label": "Critical Path Sprint",
54
+ "summary": "Dense DAGs, tighter deadlines, and much less room for idle capacity.",
55
+ "focus": "Protect the critical path and use every free slot intentionally.",
56
+ "mechanics": "Adds a tighter time budget plus seeded worker outages and task retry failures.",
57
+ },
58
+ }
59
+
60
+ CSS = """
61
+ .gradio-container {
62
+ background:
63
+ radial-gradient(circle at top left, rgba(216, 116, 76, 0.14), transparent 28%),
64
+ radial-gradient(circle at top right, rgba(201, 157, 92, 0.10), transparent 24%),
65
+ linear-gradient(180deg, #fbf4ea 0%, #f4e7d6 100%);
66
+ color: #2d241c;
67
+ font-family: "IBM Plex Sans", "Avenir Next", "Segoe UI", sans-serif;
68
+ }
69
+ .wa-shell {max-width: 1380px; margin: 0 auto; padding: 10px 10px 30px;}
70
+ .wa-title {margin-bottom: 18px;}
71
+ .wa-title h1 {margin: 0; font-size: 2.6rem; line-height: 1; letter-spacing: -0.04em; color: #3e2618;}
72
+ .wa-title p {margin: 10px 0 0; max-width: 920px; font-size: 1rem; color: #745f50;}
73
+ .wa-hero {display: grid; grid-template-columns: 1.15fr 0.85fr; gap: 18px; margin-bottom: 18px;}
74
+ .wa-card {
75
+ background: rgba(255, 252, 247, 0.92);
76
+ border: 1px solid rgba(139, 110, 84, 0.12);
77
+ border-radius: 24px;
78
+ box-shadow: 0 18px 60px rgba(114, 84, 51, 0.10);
79
+ backdrop-filter: blur(16px);
80
+ }
81
+ .wa-control-card {padding: 20px;}
82
+ .wa-control-card h3,
83
+ .wa-panel h3,
84
+ .wa-playbook h3 {
85
+ margin: 0 0 8px;
86
+ font-size: 0.8rem;
87
+ letter-spacing: 0.12em;
88
+ text-transform: uppercase;
89
+ color: #9f5b33;
90
+ }
91
+ .wa-control-card p,
92
+ .wa-panel p,
93
+ .wa-playbook p {margin: 0; color: #715e50; line-height: 1.5;}
94
+ .wa-control-grid {display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 12px; align-items: end; margin-top: 14px;}
95
+ .wa-control-buttons {display: flex; gap: 10px; align-items: center; margin-top: 12px;}
96
+ .wa-inline-buttons {display: flex; gap: 10px; align-items: center; flex-wrap: wrap;}
97
+ .wa-compact-accordion {margin-top: 14px;}
98
+ .wa-compact-accordion .label-wrap span {font-size: 0.85rem;}
99
+ .wa-problem-box {
100
+ padding: 4px 2px 2px;
101
+ border-radius: 18px;
102
+ }
103
+ .wa-problem-box strong {color: #8d4f2d;}
104
+ .wa-problem-box p {margin: 0 0 8px; color: #6d594c;}
105
+ .wa-problem-box p:last-child {margin-bottom: 0;}
106
+ .wa-preset-card {padding: 20px; min-height: 100%;}
107
+ .wa-preset-card .eyebrow {font-size: 0.75rem; letter-spacing: 0.14em; text-transform: uppercase; color: #b16d3d;}
108
+ .wa-preset-card .name {margin-top: 8px; font-size: 1.7rem; font-weight: 700; letter-spacing: -0.03em; color: #3e2618;}
109
+ .wa-preset-card .summary {margin-top: 10px; color: #6e594a; line-height: 1.55;}
110
+ .wa-preset-meta {margin-top: 12px; color: #7b6554; line-height: 1.5;}
111
+ .wa-preset-focus {
112
+ margin-top: 14px;
113
+ padding: 12px 14px;
114
+ border-radius: 18px;
115
+ background: linear-gradient(135deg, rgba(222, 143, 93, 0.12), rgba(242, 210, 171, 0.28));
116
+ border: 1px solid rgba(174, 117, 72, 0.14);
117
+ }
118
+ .wa-topbar {display: grid; grid-template-columns: 1.2fr 1fr 1fr 1fr 1fr 1fr; gap: 12px; margin: 0 0 16px;}
119
+ .wa-stat {
120
+ background: linear-gradient(180deg, #fff8ef 0%, #f9efe2 100%);
121
+ border: 1px solid rgba(168, 130, 95, 0.16);
122
+ border-radius: 20px;
123
+ padding: 16px 18px;
124
+ }
125
+ .wa-stat .label {font-size: 0.7rem; letter-spacing: 0.12em; text-transform: uppercase; color: #a56c43;}
126
+ .wa-stat .value {margin-top: 6px; font-size: 1.65rem; font-weight: 700; color: #3e2618;}
127
+ .wa-stat .sub {margin-top: 4px; font-size: 0.82rem; color: #7a6657;}
128
+ .wa-banner {
129
+ border-radius: 24px;
130
+ padding: 18px 20px;
131
+ border: 1px solid rgba(177, 107, 70, 0.18);
132
+ background: linear-gradient(135deg, rgba(245, 186, 144, 0.35), rgba(255, 251, 245, 0.96));
133
+ color: #35271f;
134
+ margin-bottom: 16px;
135
+ }
136
+ .wa-banner.invalid {
137
+ background: linear-gradient(135deg, rgba(247, 202, 196, 0.92), rgba(255, 246, 244, 0.98));
138
+ border-color: rgba(180, 84, 69, 0.22);
139
+ }
140
+ .wa-banner.done {
141
+ background: linear-gradient(135deg, rgba(219, 229, 195, 0.9), rgba(255, 251, 244, 0.98));
142
+ border-color: rgba(112, 139, 90, 0.22);
143
+ }
144
+ .wa-banner-top {display: flex; justify-content: space-between; gap: 16px; align-items: flex-start;}
145
+ .wa-banner .status {
146
+ display: inline-block;
147
+ padding: 5px 10px;
148
+ border-radius: 999px;
149
+ font-size: 0.72rem;
150
+ font-weight: 700;
151
+ letter-spacing: 0.08em;
152
+ text-transform: uppercase;
153
+ background: rgba(164, 97, 59, 0.12);
154
+ }
155
+ .wa-banner .meta {margin-top: 8px; font-size: 0.9rem; color: #7f6654;}
156
+ .wa-banner .note {margin-top: 12px; font-size: 1rem; line-height: 1.5; color: #49372c;}
157
+ .wa-banner-grid {display: grid; grid-template-columns: repeat(4, minmax(0, 1fr)); gap: 10px; margin-top: 14px;}
158
+ .wa-banner-metric {
159
+ padding: 12px 14px;
160
+ border-radius: 18px;
161
+ background: rgba(255, 253, 248, 0.76);
162
+ border: 1px solid rgba(178, 132, 96, 0.12);
163
+ }
164
+ .wa-banner-metric span {display: block; font-size: 0.72rem; letter-spacing: 0.08em; text-transform: uppercase; color: #a16b45;}
165
+ .wa-banner-metric strong {display: block; margin-top: 6px; font-size: 1.1rem; color: #3d281c;}
166
+ .wa-progress {height: 10px; margin-top: 14px; border-radius: 999px; overflow: hidden; background: rgba(120, 83, 54, 0.10);}
167
+ .wa-progress-fill {height: 100%; background: linear-gradient(90deg, #e49157 0%, #f1c27a 100%);}
168
+ .wa-main {display: grid; grid-template-columns: 1.18fr 0.82fr; gap: 18px; align-items: start;}
169
+ .wa-left-stack,
170
+ .wa-right-stack {display: grid; gap: 18px;}
171
+ .wa-panel {padding: 18px 18px 16px;}
172
+ .wa-playbook {padding: 18px;}
173
+ .wa-playbook-header {display: flex; justify-content: space-between; gap: 12px; align-items: center; margin-bottom: 12px;}
174
+ .wa-playbook-title {font-size: 1.4rem; font-weight: 700; letter-spacing: -0.03em; color: #3e2618;}
175
+ .wa-chip-row {display: flex; flex-wrap: wrap; gap: 8px; margin-top: 12px;}
176
+ .wa-chip {
177
+ display: inline-flex;
178
+ align-items: center;
179
+ padding: 7px 10px;
180
+ border-radius: 999px;
181
+ background: rgba(172, 121, 80, 0.10);
182
+ color: #7b4f32;
183
+ font-size: 0.84rem;
184
+ font-weight: 600;
185
+ }
186
+ .wa-lane-header {display: flex; justify-content: space-between; gap: 12px; align-items: flex-start; margin-bottom: 8px;}
187
+ .wa-lane-title {font-size: 1.32rem; font-weight: 700; letter-spacing: -0.03em; color: #3e2618;}
188
+ .wa-lane-copy {font-size: 0.96rem; color: #78624f;}
189
+ .wa-hint {
190
+ margin-bottom: 14px;
191
+ padding: 12px 14px;
192
+ border-radius: 18px;
193
+ background: rgba(174, 126, 88, 0.08);
194
+ border: 1px solid rgba(174, 126, 88, 0.12);
195
+ color: #6d4a32;
196
+ }
197
+ .wa-card-grid {display: grid; grid-template-columns: repeat(auto-fit, minmax(235px, 1fr)); gap: 12px;}
198
+ .wa-task-card {
199
+ background: linear-gradient(180deg, #fffaf2 0%, #f7ecde 100%);
200
+ border: 1px solid rgba(175, 135, 100, 0.18);
201
+ border-radius: 22px;
202
+ padding: 14px;
203
+ color: #34261d;
204
+ }
205
+ .wa-task-card.running {background: linear-gradient(180deg, #f4ebe0 0%, #ecdcca 100%);}
206
+ .wa-task-card.recommended {outline: 2px solid rgba(226, 145, 87, 0.9); outline-offset: 2px;}
207
+ .wa-task-head {display: flex; justify-content: space-between; gap: 10px; align-items: flex-start; margin-bottom: 10px;}
208
+ .wa-task-name {font-size: 1.08rem; font-weight: 700; color: #3c271b;}
209
+ .wa-badge {
210
+ display: inline-flex;
211
+ align-items: center;
212
+ padding: 4px 8px;
213
+ border-radius: 999px;
214
+ background: rgba(170, 123, 84, 0.12);
215
+ color: #825336;
216
+ font-size: 0.68rem;
217
+ font-weight: 700;
218
+ letter-spacing: 0.08em;
219
+ text-transform: uppercase;
220
+ }
221
+ .wa-badge.urgent {background: rgba(208, 108, 97, 0.16); color: #8c3c35;}
222
+ .wa-badge.active {background: rgba(151, 179, 120, 0.18); color: #5f7142;}
223
+ .wa-badge.recommended {background: rgba(229, 166, 93, 0.20); color: #86501f;}
224
+ .wa-badge.retry {background: rgba(176, 141, 78, 0.18); color: #7a5a22;}
225
+ .wa-task-meta {display: flex; flex-wrap: wrap; gap: 8px; margin-bottom: 10px;}
226
+ .wa-task-meta span {
227
+ padding: 5px 8px;
228
+ border-radius: 999px;
229
+ background: rgba(179, 139, 104, 0.10);
230
+ font-size: 0.76rem;
231
+ color: #77553b;
232
+ }
233
+ .wa-metrics {display: grid; grid-template-columns: repeat(2, minmax(0, 1fr)); gap: 10px 12px;}
234
+ .wa-metric span {display: block; font-size: 0.68rem; letter-spacing: 0.08em; text-transform: uppercase; color: #a26c45;}
235
+ .wa-metric strong {display: block; margin-top: 4px; font-size: 0.98rem; color: #35261d;}
236
+ .wa-empty {
237
+ padding: 20px;
238
+ border-radius: 20px;
239
+ border: 1px dashed rgba(176, 133, 98, 0.26);
240
+ background: rgba(178, 143, 112, 0.06);
241
+ color: #7b6656;
242
+ text-align: center;
243
+ }
244
+ .wa-action-row {display: flex; flex-wrap: wrap; gap: 10px; margin-top: 14px;}
245
+ .wa-button-primary button {background: linear-gradient(135deg, #d97b4b, #c95f34) !important; color: #fff7f0 !important; border: none !important;}
246
+ .wa-button-secondary button {background: #8f5b3b !important; color: #fff8f2 !important; border: none !important;}
247
+ .wa-button-ghost button {background: rgba(180, 132, 96, 0.08) !important; color: #7a4d31 !important; border: 1px solid rgba(180, 132, 96, 0.16) !important;}
248
+ .wa-plot-wrap {padding: 10px 10px 2px;}
249
+ .wa-footer-stack {display: grid; gap: 18px; margin-top: 18px;}
250
+ .wa-accordion {border-radius: 20px !important; overflow: hidden;}
251
+ @media (max-width: 1080px) {
252
+ .wa-hero,
253
+ .wa-main {grid-template-columns: 1fr;}
254
+ .wa-topbar {grid-template-columns: repeat(2, minmax(0, 1fr));}
255
+ }
256
+ @media (max-width: 760px) {
257
+ .wa-control-grid {grid-template-columns: 1fr;}
258
+ .wa-banner-grid {grid-template-columns: repeat(2, minmax(0, 1fr));}
259
+ .wa-topbar {grid-template-columns: 1fr;}
260
+ }
261
+ """
262
+
263
+
264
def _blank_session() -> Session:
    """Create a fresh UI session: new environment, no observation, empty history."""
    return {
        "env": WorkflowArenaEnvironment(),
        "observation": None,
        "history": [],
    }
266
+
267
+
268
+ def _fmt_num(value: Any, digits: int = 3) -> str:
269
+ if value is None:
270
+ return "—"
271
+ if isinstance(value, float):
272
+ return f"{value:.{digits}f}"
273
+ return str(value)
274
+
275
+
276
def _preset_html(preset: str) -> str:
    """Render the preset-brief card for the given preset value.

    Falls back to the EASY brief when *preset* is not a known key, and
    derives the time-budget note from the preset's configured multiplier.
    """
    brief = PRESET_BRIEFS.get(preset, PRESET_BRIEFS[DifficultyPreset.EASY.value])
    preset_config = get_preset_config(DifficultyPreset(preset))
    # None means the preset imposes no time budget at all.
    budget_note = (
        "No fixed time budget."
        if preset_config.time_budget_multiplier is None
        else f"Time budget uses {preset_config.time_budget_multiplier:.2f}x the lower-bound makespan."
    )
    return (
        '<div class="wa-preset-card wa-card">'
        '<div class="eyebrow">Preset brief</div>'
        f'<div class="name">{brief["label"]}</div>'
        f'<div class="summary">{brief["summary"]}</div>'
        f'<div class="wa-preset-meta">{budget_note} {brief["mechanics"]}</div>'
        f'<div class="wa-preset-focus"><strong>What matters now:</strong> {brief["focus"]}</div>'
        "</div>"
    )
293
+
294
+
295
+ def _status_text(observation: Any) -> tuple[str, str]:
296
+ if observation.validation_error:
297
+ return "Invalid action", "bad"
298
+ if observation.done and observation.termination_reason:
299
+ return "Episode terminated", "bad"
300
+ if observation.done:
301
+ return "Workflow completed", "good"
302
+ if observation.free_workers == 0 and observation.running_tasks:
303
+ return "Wait required", ""
304
+ if observation.ready_tasks:
305
+ return "Ready to dispatch", ""
306
+ return "Waiting on completions", ""
307
+
308
+
309
+ def _recommended_task_ids(observation: Any) -> list[str]:
310
+ if observation is None or observation.done or observation.free_workers <= 0:
311
+ return []
312
+ ready_tasks = list(observation.ready_tasks)
313
+ if not ready_tasks:
314
+ return []
315
+ time_remaining = observation.time_remaining
316
+ ranked = sorted(
317
+ ready_tasks,
318
+ key=lambda task: (
319
+ time_remaining is not None and task.duration > time_remaining,
320
+ max(0, task.duration - time_remaining) if time_remaining is not None else 0,
321
+ task.slack if task.slack is not None else 1_000_000,
322
+ task.deadline if task.deadline is not None else 1_000_000,
323
+ -(task.criticality or 0.0),
324
+ -task.priority,
325
+ task.duration,
326
+ task.task_id,
327
+ ),
328
+ )
329
+ return [task.task_id for task in ranked[: observation.free_workers]]
330
+
331
+
332
+ def _dispatch_window(observation: Any) -> tuple[int, int, int]:
333
+ ready_count = len(observation.ready_tasks)
334
+ free_workers = max(0, observation.free_workers)
335
+ dispatchable_now = min(ready_count, free_workers)
336
+ overflow_ready = max(0, ready_count - free_workers)
337
+ return ready_count, dispatchable_now, overflow_ready
338
+
339
+
340
def _topbar_html(observation: Any) -> str:
    """Render the six-stat top bar (state, workers, progress, reward, time, score)."""
    completed = observation.progress.completed
    # Clamp the denominator so an empty DAG cannot divide by zero.
    total = max(1, observation.progress.total)
    # Prefer the top-level benchmark score; fall back to success metrics.
    score = observation.benchmark_score
    if score is None:
        score = observation.success_metrics.benchmark_score
    time_sub = (
        f"{observation.time_remaining} remaining"
        if observation.time_remaining is not None
        else "simulation clock"
    )
    # When an outage reduces usable workers below the total, label the
    # worker stat accordingly.
    worker_sub = (
        f"idle / usable of {observation.total_workers}"
        if getattr(observation, "effective_workers", observation.total_workers) != observation.total_workers
        else "free / total"
    )
    # (label, value, sub-caption) triples rendered left-to-right.
    cards = [
        ("State", _status_text(observation)[0], f"{observation.progress.ready} ready / {observation.progress.running} running"),
        (
            "Workers",
            f"{observation.free_workers}/{getattr(observation, 'effective_workers', observation.total_workers)}",
            worker_sub,
        ),
        ("Completed", f"{completed}/{total}", f"{round(100 * completed / total, 1)}% finished"),
        ("Reward", _fmt_num(observation.cumulative_reward, 3), "cumulative"),
        ("Time", observation.current_time, time_sub),
        ("Score", _fmt_num(score, 3), "terminal if done"),
    ]
    return '<div class="wa-topbar">' + "".join(
        f'<div class="wa-stat"><div class="label">{label}</div><div class="value">{value}</div><div class="sub">{sub}</div></div>'
        for label, value, sub in cards
    ) + "</div>"
372
+
373
+
374
def _banner_html(observation: Any) -> str:
    """Render the status banner: state, note, key metrics, and progress bar.

    The banner's CSS class shifts with episode state: ``invalid`` for a
    rejected action or abnormal termination, ``done`` on success.
    """
    completed = observation.progress.completed
    total = max(1, observation.progress.total)  # guard divide-by-zero on empty DAGs
    progress_pct = round(100 * completed / total, 1)
    status_text, status_kind = _status_text(observation)
    banner_class = "wa-banner"
    if status_kind == "bad":
        banner_class += " invalid"
    elif status_kind == "good":
        banner_class += " done"

    # Fold validation errors and failure-event summaries into the note.
    failure_note = _failure_summary(observation)
    note = observation.note or "No environment note."
    if observation.validation_error:
        note = f"{note} {observation.validation_error}"
    if failure_note:
        note = f"{note} {failure_note}"

    # NOTE(review): the original also computed a benchmark score here but
    # never rendered it; the dead computation has been removed.

    metric_cards = [
        ("Ready", observation.progress.ready),
        ("Running", observation.progress.running),
        ("Workers", f"{getattr(observation, 'effective_workers', observation.total_workers)}/{observation.total_workers}"),
        (
            "Time Left",
            observation.time_remaining if observation.time_remaining is not None else "—",
        ),
    ]

    return (
        f'<div class="{banner_class}">'
        '<div class="wa-banner-top">'
        '<div>'
        f'<span class="status">{status_text}</span>'
        f'<div class="meta">Preset: {observation.config.preset.value} • Seed: {observation.config.seed} • Workers: {observation.total_workers}</div>'
        f'<div class="note">{note}</div>'
        "</div>"
        f'<div class="meta">{completed}/{total} complete</div>'
        "</div>"
        '<div class="wa-banner-grid">'
        + "".join(
            f'<div class="wa-banner-metric"><span>{label}</span><strong>{value}</strong></div>'
            for label, value in metric_cards
        )
        + "</div>"
        f'<div class="wa-progress"><div class="wa-progress-fill" style="width:{progress_pct:.1f}%"></div></div>'
        "</div>"
    )
425
+
426
+
427
def _planner_html(observation: Any) -> str:
    """Render the decision-support card: a headline, advice body, and chips.

    The headline/body pair is chosen by episode state in priority order:
    finished > invalid action > all workers busy > dispatch recommendation
    > hold for completions.
    """
    recommended = _recommended_task_ids(observation)
    ready_count, dispatchable_now, overflow_ready = _dispatch_window(observation)
    if observation.done:
        title = "Episode finished"
        body = "Reset for another episode or inspect the final timeline and reward trace below."
    elif observation.validation_error:
        title = "Fix the last move"
        body = observation.validation_error
    elif observation.free_workers == 0 and observation.running_tasks:
        title = "Advance time"
        body = "All workers are occupied. Waiting is the only legal move until the next task completes."
    elif recommended:
        title = f"Dispatch {', '.join(recommended)}"
        body = (
            "These tasks minimize slack first, then prefer tighter deadlines, stronger criticality, and higher priority. "
            f"The recommendation is capped at `{dispatchable_now}` because only `{observation.free_workers}` worker"
            f"{'s are' if observation.free_workers != 1 else ' is'} free right now."
        )
    else:
        title = "Hold for completions"
        body = "No ready work is available. Wait until dependencies unlock new tasks."

    # Always-on chips, followed by situational ones (time budget, overflow,
    # next completion time, outages).
    chips = [
        f"free workers: {observation.free_workers}",
        f"usable workers: {getattr(observation, 'effective_workers', observation.total_workers)}",
        f"ready queue: {ready_count}",
        f"dispatchable now: {dispatchable_now}",
        f"last reward: {_fmt_num(observation.reward if hasattr(observation, 'reward') else 0.0, 3)}",
    ]
    if observation.time_remaining is not None:
        chips.append(f"time remaining: {observation.time_remaining}")
    if overflow_ready:
        chips.append(f"queued beyond capacity: {overflow_ready}")
    if observation.running_tasks:
        # Earliest end time among running tasks; falls back to the current
        # time for tasks with no recorded end_time.
        next_finish = min(task.end_time or observation.current_time for task in observation.running_tasks)
        chips.append(f"next completion: t={next_finish}")
    if observation.degraded_workers:
        chips.append(f"worker outage: -{observation.degraded_workers} usable")

    return (
        '<div class="wa-playbook wa-card">'
        '<div class="wa-playbook-header">'
        '<div>'
        '<h3>Decision support</h3>'
        f'<div class="wa-playbook-title">{title}</div>'
        "</div>"
        f'<div class="wa-chip">{_status_text(observation)[0]}</div>'
        "</div>"
        f'<p>{body}</p>'
        '<div class="wa-chip-row">'
        + "".join(f'<div class="wa-chip">{chip}</div>' for chip in chips)
        + "</div>"
        "</div>"
    )
482
+
483
+
484
def _capacity_hint(observation: Any) -> str:
    """Return a one-line hint describing the user's legal next move."""
    queue_size, cap_now, queued_overflow = _dispatch_window(observation)
    if observation.done:
        return "Episode finished. Review the schedule or reset to try another seed."
    if observation.validation_error:
        plural = 's' if observation.free_workers != 1 else ''
        return (
            f"Last action was rejected. Select at most {observation.free_workers} ready "
            f"task{plural}."
        )
    if not observation.free_workers and observation.running_tasks:
        return "All workers are busy. Use Wait to jump to the next completion."
    if not observation.ready_tasks:
        return "No ready tasks available right now. Wait until dependencies unlock more work."
    suffix = ""
    if queued_overflow:
        suffix = f" `{queued_overflow}` ready task(s) will stay queued."
    plural = 's' if queue_size != 1 else ''
    return (
        f"{queue_size} ready task{plural}. "
        f"You can dispatch up to {cap_now} right now.{suffix}"
    )
502
+
503
+
504
+ def _task_badges(task: Any, *, running: bool = False, recommended: bool = False) -> str:
505
+ badges: list[str] = []
506
+ if task.deadline is not None and task.slack is not None and task.slack <= 1:
507
+ badges.append('<span class="wa-badge urgent">Urgent</span>')
508
+ if getattr(task, "attempt_count", 0) > 0:
509
+ badges.append(
510
+ f'<span class="wa-badge retry">Retry {getattr(task, "attempt_count", 0) + 1}</span>'
511
+ )
512
+ if recommended:
513
+ badges.append('<span class="wa-badge recommended">Recommended</span>')
514
+ if running:
515
+ badges.append('<span class="wa-badge active">Running</span>')
516
+ if not badges:
517
+ badges.append('<span class="wa-badge">Ready</span>')
518
+ return "".join(badges)
519
+
520
+
521
def _task_card(task: Any, *, running: bool = False, recommended: bool = False) -> str:
    """Render one task as an HTML card with badges and a metric grid.

    The `running`/`recommended` flags add CSS classes and are forwarded to
    `_task_badges`; a running card shows its finish time, a ready card its
    start time.
    """
    # Missing optional fields are rendered as an em dash placeholder.
    deps = ", ".join(task.dependencies) if task.dependencies else "None"
    deadline = task.deadline if task.deadline is not None else "—"
    start = task.start_time if task.start_time is not None else "—"
    end = task.end_time if task.end_time is not None else "—"
    classes = ["wa-task-card"]
    if running:
        classes.append("running")
    if recommended:
        classes.append("recommended")
    return (
        f'<div class="{" ".join(classes)}">'
        '<div class="wa-task-head">'
        f'<div class="wa-task-name">{task.task_id}</div>'
        f'<div>{_task_badges(task, running=running, recommended=recommended)}</div>'
        "</div>"
        # attempt_count defaults to 0, so the display is 1-based ("attempts: 1").
        f'<div class="wa-task-meta"><span>deps: {deps}</span><span>downstream: {task.downstream_count}</span><span>attempts: {getattr(task, "attempt_count", 0) + 1}</span></div>'
        '<div class="wa-metrics">'
        f'<div class="wa-metric"><span>Deadline</span><strong>{deadline}</strong></div>'
        f'<div class="wa-metric"><span>Duration</span><strong>{task.duration}</strong></div>'
        f'<div class="wa-metric"><span>Priority</span><strong>{task.priority}</strong></div>'
        f'<div class="wa-metric"><span>Criticality</span><strong>{_fmt_num(task.criticality, 3)}</strong></div>'
        f'<div class="wa-metric"><span>Slack</span><strong>{_fmt_num(task.slack, 1)}</strong></div>'
        # The last metric cell is context sensitive: finish time while running,
        # start time otherwise.
        f'<div class="wa-metric"><span>{"Finish" if running else "Start"}</span><strong>{end if running else start}</strong></div>'
        "</div>"
        "</div>"
    )
548
+
549
+
550
+ def _cards_html(tasks: list[Any], *, running: bool = False, recommended_ids: set[str] | None = None) -> str:
551
+ if not tasks:
552
+ message = "No tasks in this lane yet." if running else "No ready tasks available."
553
+ return f'<div class="wa-empty">{message}</div>'
554
+ recommended_ids = recommended_ids or set()
555
+ return '<div class="wa-card-grid">' + "".join(
556
+ _task_card(task, running=running, recommended=task.task_id in recommended_ids)
557
+ for task in tasks
558
+ ) + "</div>"
559
+
560
+
561
def _timeline_figure(observation: Any) -> go.Figure:
    """Build the Plotly Gantt-style timeline for the current observation.

    Completed and running tasks are drawn as horizontal bars anchored at
    their start times; ready tasks appear as diamond markers at the current
    time. A dashed "Now" line marks the present tick.
    """
    fig = go.Figure()

    # Stable task_id ordering keeps the y-axis deterministic across renders.
    completed = sorted(observation.completed_tasks, key=lambda task: task.task_id)
    running = sorted(observation.running_tasks, key=lambda task: task.task_id)
    ready = sorted(observation.ready_tasks, key=lambda task: task.task_id)

    timeline_tasks = completed + running
    task_ids = [task.task_id for task in timeline_tasks] + [task.task_id for task in ready]

    if completed:
        fig.add_trace(
            go.Bar(
                # Bar length = duration; guard against missing/negative spans.
                x=[max(0, (task.end_time or 0) - (task.start_time or 0)) for task in completed],
                y=[task.task_id for task in completed],
                base=[task.start_time or 0 for task in completed],
                orientation="h",
                name="Completed",
                marker_color="#85c88a",
                hovertemplate=(
                    "<b>%{y}</b><br>Status: Completed<br>Start: %{base}<br>"
                    "Duration: %{x}<extra></extra>"
                ),
            )
        )

    if running:
        fig.add_trace(
            go.Bar(
                # Running tasks may lack start/end yet; fall back to "now".
                x=[
                    max(
                        0,
                        (task.end_time or observation.current_time) - (task.start_time or observation.current_time),
                    )
                    for task in running
                ],
                y=[task.task_id for task in running],
                base=[task.start_time or observation.current_time for task in running],
                orientation="h",
                name="Running",
                marker_color="#d88a5b",
                hovertemplate=(
                    "<b>%{y}</b><br>Status: Running<br>Start: %{base}<br>"
                    "Allocated span: %{x}<extra></extra>"
                ),
            )
        )

    if ready:
        fig.add_trace(
            go.Scatter(
                x=[observation.current_time] * len(ready),
                y=[task.task_id for task in ready],
                mode="markers",
                name="Ready",
                marker=dict(color="#9e6a43", size=11, symbol="diamond"),
                # customdata feeds the hovertemplate placeholders below.
                customdata=[[task.deadline, task.priority, task.duration] for task in ready],
                hovertemplate=(
                    "<b>%{y}</b><br>Status: Ready<br>Current time: %{x}<br>"
                    "Deadline: %{customdata[0]}<br>Priority: %{customdata[1]}<br>"
                    "Duration: %{customdata[2]}<extra></extra>"
                ),
            )
        )

    if not task_ids:
        # Empty episode: show a single placeholder row plus an annotation.
        task_ids = ["No tasks yet"]
        fig.add_annotation(
            text="Reset an episode to populate the workflow timeline.",
            x=0.5,
            y=0.5,
            xref="paper",
            yref="paper",
            showarrow=False,
            font=dict(color="#8c6f58", size=14),
        )

    # X-range extends one tick past the latest known finish (or "now").
    horizon_candidates = [observation.current_time + 1]
    horizon_candidates.extend(task.end_time or 0 for task in completed)
    horizon_candidates.extend(task.end_time or observation.current_time for task in running)
    x_max = max(horizon_candidates) + 1

    fig.add_vline(
        x=observation.current_time,
        line_width=2,
        line_dash="dash",
        line_color="#9f6b48",
        annotation_text="Now",
        annotation_position="top left",
    )

    fig.update_layout(
        barmode="overlay",
        # Scale the figure height with the number of rows, with a floor.
        height=max(280, 90 + 34 * len(task_ids)),
        margin=dict(l=10, r=10, t=44, b=18),
        paper_bgcolor="#ffffff",
        plot_bgcolor="#ffffff",
        font=dict(color="#4d382b", family="IBM Plex Sans, Arial, sans-serif"),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, x=0),
        title=dict(text="Workflow Timeline", x=0.02, font=dict(size=18)),
        xaxis=dict(
            title="Simulated Time",
            range=[0, x_max],
            gridcolor="#eadfce",
            zeroline=False,
            linecolor="#cfb79c",
            title_font=dict(color="#6c4a33"),
            tickfont=dict(color="#6c4a33"),
        ),
        yaxis=dict(
            title="Tasks",
            # Reversed so the first task renders at the top of the chart.
            categoryorder="array",
            categoryarray=list(reversed(task_ids)),
            gridcolor="#f2e8da",
            linecolor="#cfb79c",
            title_font=dict(color="#6c4a33"),
            tickfont=dict(color="#6c4a33"),
        ),
    )
    return fig
681
+
682
+
683
+ def _detail_rows(tasks: list[Any]) -> list[list[Any]]:
684
+ return [
685
+ [
686
+ task.task_id,
687
+ task.priority,
688
+ task.duration,
689
+ task.deadline if task.deadline is not None else "—",
690
+ _fmt_num(task.criticality, 3),
691
+ _fmt_num(task.slack, 1),
692
+ len(task.dependencies),
693
+ task.downstream_count,
694
+ getattr(task, "attempt_count", 0) + 1,
695
+ task.start_time if task.start_time is not None else "—",
696
+ task.end_time if task.end_time is not None else "—",
697
+ ]
698
+ for task in tasks
699
+ ]
700
+
701
+
702
+ def _failure_summary(observation: Any) -> str:
703
+ events = getattr(observation, "recent_failure_events", []) or []
704
+ if not events:
705
+ return ""
706
+ return " ".join(event.detail for event in events if getattr(event, "detail", ""))
707
+
708
+
709
def _selection_markdown(selected_task_ids: list[str], observation: Any) -> str:
    """Summarize the dispatch-builder state as markdown.

    With no selection it surfaces the recommended batch (when any); with a
    selection it reports priority sum, earliest/longest finish times, and a
    warning if the batch exceeds free-worker capacity.
    """
    if observation is None:
        return "No episode yet. Reset an episode to start building a dispatch batch."

    # Drop any selected ids that are no longer in the ready queue.
    task_map = {task.task_id: task for task in observation.ready_tasks}
    selected_tasks = [task_map[task_id] for task_id in selected_task_ids if task_id in task_map]
    capacity = max(0, observation.free_workers)
    ready_count, dispatchable_now, overflow_ready = _dispatch_window(observation)

    if not selected_tasks:
        recommended = _recommended_task_ids(observation)
        if not recommended:
            return (
                f"**Dispatch builder**\n\nReady queue: `{ready_count}`. "
                f"Dispatchable now: `{dispatchable_now}`."
            )
        overflow_suffix = f" `{overflow_ready}` ready task(s) stay queued after dispatch." if overflow_ready else ""
        return (
            f"**Dispatch builder**\n\nNo tasks selected yet. Ready queue: `{ready_count}`. "
            f"Recommended batch: `{', '.join(recommended)}`. Dispatch cap now: `{dispatchable_now}`.{overflow_suffix}"
        )

    total_priority = sum(task.priority for task in selected_tasks)
    # All selected tasks would start now, so finish = now + duration.
    shortest_finish = observation.current_time + min(task.duration for task in selected_tasks)
    longest_finish = observation.current_time + max(task.duration for task in selected_tasks)
    warnings: list[str] = []
    if len(selected_tasks) > capacity:
        warnings.append(f"Selection exceeds capacity by `{len(selected_tasks) - capacity}`.")

    warning_text = "\n\n" + " ".join(warnings) if warnings else ""
    return (
        f"**Dispatch builder**\n\nSelected `{len(selected_tasks)}` task(s) for `{capacity}` free slot(s). "
        f"Priority sum: `{total_priority}`. Earliest completion: `t={shortest_finish}`. "
        f"Longest in-flight span: `t={longest_finish}`.{warning_text}"
    )
744
+
745
+
746
+ def _reward_markdown(observation: Any) -> str:
747
+ breakdown = observation.last_reward_breakdown
748
+ rows = [
749
+ ("completion", breakdown.completion_reward),
750
+ ("utilization", breakdown.utilization_reward),
751
+ ("deadline", breakdown.deadline_reward),
752
+ ("criticality", breakdown.criticality_reward),
753
+ ("idle", breakdown.idle_penalty),
754
+ ("invalid", breakdown.invalid_action_penalty),
755
+ ("terminal", breakdown.terminal_makespan_score),
756
+ ("unfinished", breakdown.unfinished_task_penalty),
757
+ ]
758
+ lines = ["| Channel | Value |", "| --- | ---: |"]
759
+ lines.extend(f"| {label} | {value:.3f} |" for label, value in rows)
760
+ return "\n".join(lines)
761
+
762
+
763
+ def _history_markdown(history: list[dict[str, Any]]) -> str:
764
+ if not history:
765
+ return "No actions yet."
766
+ lines: list[str] = []
767
+ for item in history[-12:]:
768
+ reward = _fmt_num(item.get("reward"), 3)
769
+ suffix = f" • error: `{item['error']}`" if item.get("error") else ""
770
+ note = item.get("note") or ""
771
+ lines.append(f"**{item['label']}** at `t={item['time']}` • reward `{reward}`{suffix} \n{note}")
772
+ return "\n\n".join(reversed(lines))
773
+
774
+
775
def _blank_observation_view() -> Any:
    """Return a stand-in observation for rendering before any episode exists.

    Mirrors the attribute surface the rendering helpers read (counts, task
    lists, config) with zeroed/empty values, so the UI can draw an empty
    state without a real environment.
    """
    return SimpleNamespace(
        reward=0.0,
        progress=SimpleNamespace(completed=0, ready=0, running=0, blocked=0, total=1),
        benchmark_score=None,
        success_metrics=SimpleNamespace(benchmark_score=None, unfinished_task_count=0),
        free_workers=0,
        effective_workers=0,
        degraded_workers=0,
        total_workers=0,
        time_budget=None,
        time_remaining=None,
        cumulative_reward=0.0,
        current_time=0,
        done=False,
        termination_reason=None,
        validation_error=None,
        completed_tasks=[],
        ready_tasks=[],
        running_tasks=[],
        blocked_tasks=[],
        recent_failure_events=[],
        note="Reset an episode to start scheduling.",
        # Defaults to the EASY preset so preset-dependent widgets have a value.
        config=SimpleNamespace(preset=SimpleNamespace(value=DifficultyPreset.EASY.value), seed=0),
    )
800
+
801
+
802
def _empty_updates(session: Session):
    """Produce the full output tuple for the pre-episode empty state.

    The tuple's positional order must match the ``outputs`` list wired in
    ``create_gradio_app``; all action buttons are disabled.
    """
    empty_rows: list[list[Any]] = []
    blank = _blank_observation_view()
    return (
        session,
        _preset_html(DifficultyPreset.EASY.value),
        _topbar_html(blank),
        _banner_html(blank),
        _planner_html(blank),
        "No episode yet.",
        _selection_markdown([], blank),
        '<div class="wa-empty">Reset an episode to see ready tasks.</div>',
        gr.update(choices=[], value=[]),
        gr.update(interactive=False),
        gr.update(interactive=False),
        gr.update(interactive=False),
        gr.update(interactive=False),
        gr.update(interactive=False),
        '<div class="wa-empty">Running tasks will appear here after dispatch.</div>',
        _timeline_figure(blank),
        # Placeholder reward table with a single em-dash row.
        "| Channel | Value |\n| --- | ---: |\n| — | — |",
        "No actions yet.",
        empty_rows,
        empty_rows,
        empty_rows,
    )
828
+
829
+
830
def _render(session: Session):
    """Project the session's current observation into every Gradio output.

    Returns a tuple whose positional order must match the ``outputs`` list
    wired up in ``create_gradio_app``. Falls back to the empty-state tuple
    when no observation exists yet.
    """
    observation = session.get("observation")
    env = session.get("env")
    history = session.get("history", [])
    if observation is None:
        return _empty_updates(session)

    # Prefer the live environment's debug views for completed/blocked detail;
    # fall back to the observation when rendering without an environment.
    if env is None:
        completed_rows = _detail_rows(observation.completed_tasks)
        blocked_rows = _detail_rows(observation.blocked_tasks)
    else:
        completed_rows = _detail_rows(env.debug_task_views_for_status(TaskStatus.COMPLETED))
        blocked_rows = _detail_rows(env.debug_task_views_for_status(TaskStatus.BLOCKED))

    ready_choices = [task.task_id for task in observation.ready_tasks]
    recommended_ids = _recommended_task_ids(observation)
    can_recommend = bool(recommended_ids) and not observation.done
    can_wait = bool(observation.running_tasks) and not observation.done
    can_clear = bool(ready_choices) and not observation.done

    return (
        session,
        _preset_html(observation.config.preset.value),
        _topbar_html(observation),
        _banner_html(observation),
        _planner_html(observation),
        _capacity_hint(observation),
        _selection_markdown([], observation),
        _cards_html(observation.ready_tasks, running=False, recommended_ids=set(recommended_ids)),
        gr.update(choices=ready_choices, value=[]),
        gr.update(interactive=False),
        gr.update(interactive=can_wait),
        gr.update(interactive=can_recommend),
        gr.update(interactive=can_recommend),
        gr.update(interactive=can_clear),
        _cards_html(observation.running_tasks, running=True),
        _timeline_figure(observation),
        _reward_markdown(observation),
        _history_markdown(history),
        # Clear the dispatch-preview table. (Previously a dead walrus binding:
        # `empty_rows := _detail_rows([])` — the name was never used, and
        # _detail_rows([]) is just [].)
        [],
        completed_rows,
        blocked_rows,
    )
873
+
874
+
875
def _append_history(
    session: Session,
    label: str,
    observation: Any,
    *,
    reward: float | None = None,
    error: str | None = None,
) -> Session:
    """Record one action (label, time, reward, error, note) in the session's history.

    The history list is copied before appending, then written back, and the
    same session object is returned for chaining.
    """
    entry = {
        "label": label,
        "time": observation.current_time,
        "reward": reward,
        "error": error,
        "note": observation.note,
    }
    updated = list(session.get("history", []))
    updated.append(entry)
    session["history"] = updated
    return session
895
+
896
+
897
def _reset(preset: str, seed: float, worker_count: float, session: Session):
    """Start a fresh episode (reusing any existing env) and re-render all outputs."""
    env = session.get("env") or WorkflowArenaEnvironment()
    workers = int(worker_count)
    observation = env.reset(
        preset=preset,
        seed=int(seed),
        worker_count=workers,
    )
    fresh = {"env": env, "observation": observation, "history": []}
    label = f"reset • preset `{preset}` • workers `{workers}`"
    fresh = _append_history(fresh, label, observation, reward=0.0, error=None)
    return _render(fresh)
913
+
914
+
915
def _dispatch(selected_task_ids: list[str], session: Session):
    """Step the environment with a DISPATCH action for the selected tasks."""
    env = session.get("env")
    observation = session.get("observation")
    if env is None or observation is None:
        return _render(_blank_session())

    batch = ", ".join(selected_task_ids) if selected_task_ids else "(none)"
    next_observation = env.step(
        WorkflowArenaAction(
            action_type=WorkflowActionType.DISPATCH,
            task_ids=selected_task_ids,
        )
    )
    updated = {
        "env": env,
        "observation": next_observation,
        "history": session.get("history", []),
    }
    updated = _append_history(
        updated,
        "dispatch " + batch,
        next_observation,
        reward=next_observation.reward,
        error=next_observation.validation_error,
    )
    return _render(updated)
940
+
941
+
942
def _dispatch_recommended(session: Session):
    """One-click dispatch of the recommender's batch for the current observation."""
    observation = session.get("observation")
    if observation is not None:
        return _dispatch(_recommended_task_ids(observation), session)
    return _render(_blank_session())
947
+
948
+
949
def _wait(session: Session):
    """Step the environment with a WAIT action (advance to next completion)."""
    env = session.get("env")
    observation = session.get("observation")
    if env is None or observation is None:
        return _render(_blank_session())

    next_observation = env.step(
        WorkflowArenaAction(action_type=WorkflowActionType.WAIT, task_ids=[])
    )
    updated = {
        "env": env,
        "observation": next_observation,
        "history": session.get("history", []),
    }
    updated = _append_history(
        updated,
        "wait",
        next_observation,
        reward=next_observation.reward,
        error=next_observation.validation_error,
    )
    return _render(updated)
973
+
974
+
975
def _update_selection(selected_task_ids: list[str], session: Session):
    """Refresh the batch preview and dispatch-button state for a new selection."""
    observation = session.get("observation")
    if observation is None:
        return "No episode yet.", [], gr.update(interactive=False)

    ready_by_id = {task.task_id: task for task in observation.ready_tasks}
    chosen = [tid_task for tid_task in (ready_by_id.get(tid) for tid in selected_task_ids) if tid_task is not None]
    # Dispatch is legal only when every selected id is still ready, the batch
    # fits the free worker slots, and the episode is still live.
    all_still_ready = len(chosen) == len(selected_task_ids)
    fits_capacity = len(selected_task_ids) <= observation.free_workers
    can_dispatch = bool(chosen) and all_still_ready and fits_capacity and not observation.done
    return (
        _selection_markdown(selected_task_ids, observation),
        _detail_rows(chosen),
        gr.update(interactive=can_dispatch),
    )
993
+
994
+
995
def _select_recommended(session: Session):
    """Fill the checkbox group with the recommended batch and preview it."""
    observation = session.get("observation")
    if observation is None:
        return gr.update(value=[]), "No episode yet.", [], gr.update(interactive=False)
    picks = _recommended_task_ids(observation)
    by_id = {task.task_id: task for task in observation.ready_tasks}
    preview = [by_id[tid] for tid in picks if tid in by_id]
    enable = bool(picks) and not observation.done
    return (
        gr.update(value=picks),
        _selection_markdown(picks, observation),
        _detail_rows(preview),
        gr.update(interactive=enable),
    )
1008
+
1009
+
1010
def _clear_selection(session: Session):
    """Empty the checkbox group, reset the preview, and disable manual dispatch."""
    observation = session.get("observation")
    summary = _selection_markdown([], observation)
    return gr.update(value=[]), summary, [], gr.update(interactive=False)
1018
+
1019
+
1020
+ def _random_seed() -> int:
1021
+ return random.randint(0, 999_999)
1022
+
1023
+
1024
def _preset_controls_update(preset: str):
    """Sync the preset brief and default worker count when the preset changes."""
    config = get_preset_config(DifficultyPreset(preset))
    return _preset_html(preset), gr.update(value=config.worker_count)
1027
+
1028
+
1029
def create_gradio_app() -> gr.Blocks:
    """Assemble the WorkflowArena Blocks UI and wire all event handlers.

    Layout: episode controls (preset/seed/workers), status bars, a dispatch
    lane with a batch builder, a mission-control lane with the timeline plot,
    and footer accordions for completed/blocked task tables. All handlers
    return the same positionally ordered ``outputs`` tuple produced by
    ``_render`` / ``_empty_updates``.
    """
    with gr.Blocks(title="WorkflowArena") as demo:
        # Per-browser-session state: {"env", "observation", "history"}.
        session = gr.State(_blank_session())

        with gr.Column(elem_classes=["wa-shell"]):
            gr.HTML(f"<style>{CSS}</style>")
            gr.HTML(
                """
                <div class="wa-title">
                <h1>WorkflowArena</h1>
                <p>Run a workflow episode like a control room instead of a raw form. Reset a seeded DAG, inspect urgency and capacity, build a legal dispatch batch, then advance time when workers are saturated.</p>
                </div>
                """
            )

            # --- Episode controls -------------------------------------------------
            with gr.Row(elem_classes=["wa-hero"]):
                with gr.Column(elem_classes=["wa-control-card", "wa-card"]):
                    gr.HTML("<h3>Episode controls</h3><p>Change the preset, seed, or worker count, then reset to generate a new scheduling problem.</p>")
                    with gr.Accordion("Problem Framing", open=False, elem_classes=["wa-accordion", "wa-compact-accordion"]):
                        gr.HTML(
                            """
                            <div class="wa-problem-box">
                            <p><strong>What this problem is:</strong> You are scheduling a workflow where tasks depend on each other and workers are limited. At every step, the legal move is either to dispatch ready tasks to free workers or wait for the next completion event.</p>
                            <p><strong>What good play looks like:</strong> Finish urgent and high-value work on time, keep workers utilized, and avoid delaying the critical path. Higher difficulties add a time budget and, in hard mode, failure events that reduce usable capacity or force retries.</p>
                            </div>
                            """
                        )
                    with gr.Row(elem_classes=["wa-control-grid"]):
                        preset = gr.Dropdown(
                            label="Preset",
                            choices=[preset.value for preset in DifficultyPreset],
                            value=DifficultyPreset.EASY.value,
                            interactive=True,
                        )
                        seed = gr.Number(label="Seed", value=0, precision=0, minimum=0)
                        workers = gr.Slider(
                            minimum=1,
                            maximum=6,
                            step=1,
                            value=3,
                            label="Workers",
                            interactive=True,
                        )
                    with gr.Row(elem_classes=["wa-control-buttons", "wa-inline-buttons"]):
                        reset_button = gr.Button(
                            "Reset Episode",
                            variant="primary",
                            elem_classes=["wa-button-primary"],
                        )
                        random_seed_button = gr.Button(
                            "Random Seed",
                            variant="secondary",
                            elem_classes=["wa-button-ghost"],
                        )

                    preset_brief = gr.HTML(value=_preset_html(DifficultyPreset.EASY.value))

            # Status strips filled in by _render / _empty_updates.
            topbar = gr.HTML()
            banner = gr.HTML()
            planner = gr.HTML()

            # --- Main two-lane layout --------------------------------------------
            with gr.Row(elem_classes=["wa-main"]):
                with gr.Column(elem_classes=["wa-left-stack"]):
                    with gr.Column(elem_classes=["wa-panel", "wa-card"]):
                        gr.HTML(
                            """
                            <div class="wa-lane-header">
                            <div>
                            <div class="wa-lane-title">Dispatch Lane</div>
                            <div class="wa-lane-copy">Inspect ready tasks, build a batch, and send work only when the action is legal.</div>
                            </div>
                            </div>
                            """
                        )
                        selection_summary = gr.Markdown(elem_classes=["wa-hint"])
                        ready_cards = gr.HTML()
                        ready_selector = gr.CheckboxGroup(
                            label="Build dispatch batch",
                            info="Choose up to the number of currently free workers.",
                        )
                        with gr.Row(elem_classes=["wa-action-row"]):
                            select_recommended_button = gr.Button(
                                "Select Recommended",
                                variant="secondary",
                                interactive=False,
                                elem_classes=["wa-button-secondary"],
                            )
                            dispatch_recommended_button = gr.Button(
                                "Dispatch Recommended",
                                variant="primary",
                                interactive=False,
                                elem_classes=["wa-button-primary"],
                            )
                            clear_selection_button = gr.Button(
                                "Clear Selection",
                                variant="secondary",
                                interactive=False,
                                elem_classes=["wa-button-ghost"],
                            )
                            dispatch_button = gr.Button(
                                "Dispatch Selected",
                                variant="primary",
                                interactive=False,
                                elem_classes=["wa-button-primary"],
                            )
                            wait_button = gr.Button(
                                "Wait",
                                variant="secondary",
                                interactive=False,
                                elem_classes=["wa-button-secondary"],
                            )

                    with gr.Column(elem_classes=["wa-panel", "wa-card"]):
                        gr.HTML(
                            """
                            <div class="wa-lane-header">
                            <div>
                            <div class="wa-lane-title">Selected Batch</div>
                            <div class="wa-lane-copy">This preview updates as you pick tasks from the current ready queue.</div>
                            </div>
                            </div>
                            """
                        )
                        selected_table = gr.Dataframe(
                            headers=DETAIL_HEADERS,
                            value=[],
                            interactive=False,
                            wrap=True,
                            label="Dispatch preview",
                        )

                with gr.Column(elem_classes=["wa-right-stack"]):
                    with gr.Column(elem_classes=["wa-panel", "wa-card"]):
                        gr.HTML(
                            """
                            <div class="wa-lane-header">
                            <div>
                            <div class="wa-lane-title">Mission Control</div>
                            <div class="wa-lane-copy">Use the recommendation as a guide, not a rule. The reward trace below tells you whether the choice paid off.</div>
                            </div>
                            </div>
                            """
                        )
                        gr.Markdown(elem_classes=["wa-hint"], value="No episode yet.")
                        decision_hint = gr.Markdown(elem_classes=["wa-hint"])
                        running_cards = gr.HTML()

                    with gr.Column(elem_classes=["wa-plot-wrap", "wa-card"]):
                        timeline_plot = gr.Plot(label="Workflow Timeline")

                    with gr.Accordion("Reward Breakdown", open=False, elem_classes=["wa-accordion"]):
                        reward_markdown = gr.Markdown()

                    with gr.Accordion("Action History", open=False, elem_classes=["wa-accordion"]):
                        history_markdown = gr.Markdown()

            # --- Footer detail tables --------------------------------------------
            with gr.Row(elem_classes=["wa-footer-stack"]):
                with gr.Accordion("Completed Tasks", open=False, elem_classes=["wa-accordion"]):
                    completed_table = gr.Dataframe(
                        headers=DETAIL_HEADERS,
                        value=[],
                        interactive=False,
                        wrap=True,
                        label="Completed",
                    )

                with gr.Accordion("Blocked Tasks", open=False, elem_classes=["wa-accordion"]):
                    blocked_table = gr.Dataframe(
                        headers=DETAIL_HEADERS,
                        value=[],
                        interactive=False,
                        wrap=True,
                        label="Blocked",
                    )

        # Positional output order — must match the tuples returned by
        # _render and _empty_updates.
        outputs = [
            session,
            preset_brief,
            topbar,
            banner,
            planner,
            decision_hint,
            selection_summary,
            ready_cards,
            ready_selector,
            dispatch_button,
            wait_button,
            select_recommended_button,
            dispatch_recommended_button,
            clear_selection_button,
            running_cards,
            timeline_plot,
            reward_markdown,
            history_markdown,
            selected_table,
            completed_table,
            blocked_table,
        ]

        # --- Event wiring --------------------------------------------------------
        random_seed_button.click(_random_seed, outputs=[seed])
        preset.change(_preset_controls_update, inputs=[preset], outputs=[preset_brief, workers])

        reset_button.click(
            _reset,
            inputs=[preset, seed, workers, session],
            outputs=outputs,
        )
        dispatch_button.click(
            _dispatch,
            inputs=[ready_selector, session],
            outputs=outputs,
        )
        dispatch_recommended_button.click(
            _dispatch_recommended,
            inputs=[session],
            outputs=outputs,
        )
        wait_button.click(
            _wait,
            inputs=[session],
            outputs=outputs,
        )

        ready_selector.change(
            _update_selection,
            inputs=[ready_selector, session],
            outputs=[selection_summary, selected_table, dispatch_button],
        )
        select_recommended_button.click(
            _select_recommended,
            inputs=[session],
            outputs=[ready_selector, selection_summary, selected_table, dispatch_button],
        )
        clear_selection_button.click(
            _clear_selection,
            inputs=[session],
            outputs=[ready_selector, selection_summary, selected_table, dispatch_button],
        )

        # Render the empty state on initial page load.
        demo.load(lambda: _empty_updates(_blank_session()), outputs=outputs)

    return demo
server/workflow_arena_environment.py ADDED
@@ -0,0 +1,873 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """WorkflowArena event-driven workflow orchestration environment."""
8
+
9
+ from __future__ import annotations
10
+
11
+ import math
12
+ import random
13
+ from typing import Any
14
+ from uuid import uuid4
15
+
16
+ from openenv.core.env_server.interfaces import Environment
17
+ from openenv.core.env_server.types import State
18
+
19
+ from workflow_arena.generator import generate_episode
20
+ from workflow_arena.models import (
21
+ DifficultyPreset,
22
+ EpisodeConfig,
23
+ FailureEventType,
24
+ ProgressSummary,
25
+ RewardBreakdown,
26
+ SuccessMetrics,
27
+ TaskStatus,
28
+ WorkflowArenaAction,
29
+ WorkflowArenaObservation,
30
+ WorkflowEnvStateSnapshot,
31
+ WorkflowEpisodeSpec,
32
+ WorkflowFailureEvent,
33
+ WorkflowTaskSpec,
34
+ WorkflowTaskView,
35
+ WorkflowActionType,
36
+ )
37
+ from workflow_arena.presets import get_preset_config
38
+
39
+
40
class WorkflowArenaEnvironment(Environment):
    """Resource-constrained workflow scheduler with event-driven semantics."""

    # Each client session may hold an independent episode of this environment.
    SUPPORTS_CONCURRENT_SESSIONS: bool = True
    # Step-limit inputs — presumably combined elsewhere as
    # max(FLOOR, MULTIPLIER * episode size); confirm against reset logic.
    STEP_LIMIT_FLOOR: int = 32
    STEP_LIMIT_MULTIPLIER: int = 8
    # Reward-shaping constants; negative values are penalties.
    INVALID_ACTION_PENALTY: float = -0.1
    OVERCAPACITY_INVALID_ACTION_PENALTY: float = -0.25
    AVOIDABLE_WAIT_PENALTY_PER_SLOT: float = -0.08
    UNFINISHED_PRIORITY_PENALTY: float = -0.02
    OVERDUE_PRIORITY_PENALTY_PER_TICK: float = -0.005
    # Cap on failure events surfaced per observation.
    MAX_RECENT_FAILURE_EVENTS: int = 6
52
+
53
    def __init__(self):
        """Initialize an unreset environment; ``reset`` must run before stepping."""
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._cumulative_reward = 0.0
        self._max_episode_steps = self.STEP_LIMIT_FLOOR
        # Placeholder config — presumably replaced by reset(); confirm there.
        self._config = EpisodeConfig(
            preset=DifficultyPreset.EASY,
            seed=0,
            worker_count=2,
        )
        # Both stay None until an episode is generated; _require_episode guards this.
        self._episode_spec: WorkflowEpisodeSpec | None = None
        self._env_state: WorkflowEnvStateSnapshot | None = None
        self._event_rng = random.Random(0)
65
+
66
+ def _require_episode(self) -> tuple[WorkflowEpisodeSpec, WorkflowEnvStateSnapshot]:
67
+ if self._episode_spec is None or self._env_state is None:
68
+ raise RuntimeError("Environment must be reset before use.")
69
+ return self._episode_spec, self._env_state
70
+
71
+ def _preset_config(self):
72
+ episode, _ = self._require_episode()
73
+ return episode.preset_config
74
+
75
+ def _task_map(self) -> dict[str, WorkflowTaskSpec]:
76
+ episode, _ = self._require_episode()
77
+ return {task.task_id: task for task in episode.tasks}
78
+
79
+ def _effective_worker_capacity(
80
+ self, env_state: WorkflowEnvStateSnapshot | None = None
81
+ ) -> int:
82
+ if env_state is None:
83
+ _, env_state = self._require_episode()
84
+ return max(0, self._config.worker_count - env_state.degraded_workers)
85
+
86
+ def _time_remaining(
87
+ self, env_state: WorkflowEnvStateSnapshot | None = None
88
+ ) -> int | None:
89
+ if env_state is None:
90
+ _, env_state = self._require_episode()
91
+ if env_state.time_budget is None:
92
+ return None
93
+ return max(0, env_state.time_budget - env_state.current_time)
94
+
95
+ def _terminal_score(self) -> float:
96
+ episode, env_state = self._require_episode()
97
+ if env_state.current_time <= 0:
98
+ return 0.0
99
+ lower_bound = self._lower_bound_makespan(episode)
100
+ score = lower_bound / max(lower_bound, env_state.current_time)
101
+ return round(score, 4)
102
+
103
+ def _benchmark_score(self) -> float:
104
+ makespan_score, deadline_score, utilization_score = self._grade_components(
105
+ include_terminal_makespan=True
106
+ )
107
+ return round(
108
+ (0.5 * makespan_score) + (0.3 * deadline_score) + (0.2 * utilization_score),
109
+ 4,
110
+ )
111
+
112
+ def _grade_components(
113
+ self, *, include_terminal_makespan: bool = False
114
+ ) -> tuple[float, float, float]:
115
+ episode, env_state = self._require_episode()
116
+ utilization = (
117
+ env_state.cumulative_busy_time
118
+ / (env_state.current_time * self._config.worker_count)
119
+ if env_state.current_time > 0
120
+ else 0.0
121
+ )
122
+ total_priority = sum(task.priority for task in episode.tasks) or 1
123
+ on_time_priority = 0
124
+ for task in episode.tasks:
125
+ end_time = env_state.task_end_times.get(task.task_id)
126
+ if end_time is None:
127
+ continue
128
+ if task.deadline is None or end_time <= task.deadline:
129
+ on_time_priority += task.priority
130
+ deadline_score = round(on_time_priority / total_priority, 4)
131
+ utilization_score = round(utilization, 4)
132
+ makespan_score = self._terminal_score() if include_terminal_makespan else 0.0
133
+ return makespan_score, deadline_score, utilization_score
134
+
135
+ def _unfinished_task_penalty(self, current_time: int) -> float:
136
+ episode, env_state = self._require_episode()
137
+ penalty = 0.0
138
+ for task in episode.tasks:
139
+ if env_state.task_statuses[task.task_id] == TaskStatus.COMPLETED:
140
+ continue
141
+ penalty += self.UNFINISHED_PRIORITY_PENALTY * task.priority
142
+ if task.deadline is not None and current_time > task.deadline:
143
+ penalty += (
144
+ self.OVERDUE_PRIORITY_PENALTY_PER_TICK
145
+ * task.priority
146
+ * (current_time - task.deadline)
147
+ )
148
+ return round(penalty, 4)
149
+
150
+ def _success_metrics(
151
+ self, *, benchmark_score_override: float | None = None
152
+ ) -> SuccessMetrics:
153
+ episode, env_state = self._require_episode()
154
+ unfinished_task_count = sum(
155
+ 1
156
+ for task in episode.tasks
157
+ if env_state.task_statuses[task.task_id] != TaskStatus.COMPLETED
158
+ )
159
+ deadline_miss_count = sum(
160
+ 1
161
+ for task in episode.tasks
162
+ if env_state.task_statuses[task.task_id] == TaskStatus.COMPLETED
163
+ and task.deadline is not None
164
+ and env_state.task_end_times.get(task.task_id, 0) > task.deadline
165
+ )
166
+ _, deadline_score, utilization_score = self._grade_components(
167
+ include_terminal_makespan=False
168
+ )
169
+ all_done = unfinished_task_count == 0
170
+ return SuccessMetrics(
171
+ makespan=env_state.current_time if all_done else None,
172
+ worker_utilization=utilization_score,
173
+ deadline_miss_count=deadline_miss_count,
174
+ unfinished_task_count=unfinished_task_count,
175
+ weighted_priority_completion=deadline_score,
176
+ benchmark_score=benchmark_score_override,
177
+ )
178
+
179
+ def _task_view(
180
+ self,
181
+ task: WorkflowTaskSpec,
182
+ status: TaskStatus,
183
+ *,
184
+ include_planner_hints: bool = True,
185
+ ) -> WorkflowTaskView:
186
+ _, env_state = self._require_episode()
187
+ return WorkflowTaskView(
188
+ task_id=task.task_id,
189
+ status=status,
190
+ duration=task.duration,
191
+ priority=task.priority,
192
+ dependencies=task.dependencies,
193
+ deadline=task.deadline,
194
+ criticality=task.criticality if include_planner_hints else None,
195
+ slack=float(task.slack) if include_planner_hints else None,
196
+ downstream_count=task.downstream_count if include_planner_hints else 0,
197
+ start_time=env_state.task_start_times.get(task.task_id),
198
+ end_time=(
199
+ env_state.task_end_times.get(task.task_id)
200
+ or env_state.task_assigned_finish_times.get(task.task_id)
201
+ ),
202
+ attempt_count=env_state.task_attempt_counts.get(task.task_id, 0),
203
+ )
204
+
205
+ def _task_views_for_status(self, status: TaskStatus) -> list[WorkflowTaskView]:
206
+ episode, env_state = self._require_episode()
207
+ return [
208
+ self._task_view(task, status, include_planner_hints=True)
209
+ for task in episode.tasks
210
+ if env_state.task_statuses[task.task_id] == status
211
+ ]
212
+
213
+ def debug_task_views_for_status(self, status: TaskStatus) -> list[WorkflowTaskView]:
214
+ return self._task_views_for_status(status)
215
+
216
+ def _set_recent_failure_events(
217
+ self,
218
+ env_state: WorkflowEnvStateSnapshot,
219
+ events: list[WorkflowFailureEvent],
220
+ ) -> None:
221
+ env_state.recent_failure_events = events[-self.MAX_RECENT_FAILURE_EVENTS :]
222
+
223
+ def _maybe_end_worker_outage(
224
+ self,
225
+ env_state: WorkflowEnvStateSnapshot,
226
+ events: list[WorkflowFailureEvent],
227
+ ) -> None:
228
+ if (
229
+ env_state.active_worker_outage_until is not None
230
+ and env_state.current_time >= env_state.active_worker_outage_until
231
+ ):
232
+ events.append(
233
+ WorkflowFailureEvent(
234
+ event_type=FailureEventType.WORKER_OUTAGE_END,
235
+ time=env_state.current_time,
236
+ worker_delta=1,
237
+ detail="Worker capacity restored.",
238
+ )
239
+ )
240
+ env_state.active_worker_outage_until = None
241
+ env_state.degraded_workers = 0
242
+
243
+ def _maybe_start_worker_outage(
244
+ self,
245
+ env_state: WorkflowEnvStateSnapshot,
246
+ events: list[WorkflowFailureEvent],
247
+ ) -> None:
248
+ preset_config = self._preset_config()
249
+ if self._config.preset != DifficultyPreset.HARD:
250
+ return
251
+ if env_state.active_worker_outage_until is not None:
252
+ return
253
+ if preset_config.worker_outage_rate <= 0.0:
254
+ return
255
+ if self._event_rng.random() >= preset_config.worker_outage_rate:
256
+ return
257
+
258
+ duration = self._event_rng.randint(
259
+ preset_config.worker_outage_duration_min,
260
+ preset_config.worker_outage_duration_max,
261
+ )
262
+ if duration <= 0:
263
+ return
264
+
265
+ env_state.degraded_workers = min(1, self._config.worker_count)
266
+ env_state.active_worker_outage_until = env_state.current_time + duration
267
+ events.append(
268
+ WorkflowFailureEvent(
269
+ event_type=FailureEventType.WORKER_OUTAGE_START,
270
+ time=env_state.current_time,
271
+ worker_delta=-env_state.degraded_workers,
272
+ duration=duration,
273
+ detail=f"Worker outage active until t={env_state.active_worker_outage_until}.",
274
+ )
275
+ )
276
+
277
+ def _should_retry_fail(self, task_id: str) -> bool:
278
+ preset_config = self._preset_config()
279
+ _, env_state = self._require_episode()
280
+ if self._config.preset != DifficultyPreset.HARD:
281
+ return False
282
+ if preset_config.task_retry_failure_rate <= 0.0:
283
+ return False
284
+ if env_state.task_attempt_counts.get(task_id, 0) >= preset_config.max_task_retries:
285
+ return False
286
+ return self._event_rng.random() < preset_config.task_retry_failure_rate
287
+
288
+ def _dispatch_potential(
289
+ self,
290
+ env_state: WorkflowEnvStateSnapshot,
291
+ task_map: dict[str, WorkflowTaskSpec],
292
+ ) -> tuple[float, float]:
293
+ if not env_state.running_task_ids:
294
+ return 0.0, 0.0
295
+
296
+ episode, _ = self._require_episode()
297
+ max_slack = max((task.slack for task in episode.tasks), default=0)
298
+ utilization_component = 0.06 * (
299
+ len(env_state.running_task_ids) / max(1, self._config.worker_count)
300
+ )
301
+ criticality_component = 0.0
302
+ for task_id in env_state.running_task_ids:
303
+ task = task_map[task_id]
304
+ slack_urgency = 1.0 if max_slack <= 0 else 1.0 - (task.slack / max_slack)
305
+ criticality_component += (0.6 * task.criticality) + (0.4 * slack_urgency)
306
+ criticality_component = 0.04 * (
307
+ criticality_component / max(1, self._config.worker_count)
308
+ )
309
+ return round(utilization_component, 4), round(criticality_component, 4)
310
+
311
+ def _base_observation(
312
+ self,
313
+ *,
314
+ reward: float,
315
+ breakdown: RewardBreakdown,
316
+ note: str,
317
+ done: bool,
318
+ benchmark_score_override: float | None = None,
319
+ ) -> WorkflowArenaObservation:
320
+ episode, env_state = self._require_episode()
321
+ ready_tasks = self._task_views_for_status(TaskStatus.READY)
322
+ running_tasks = self._task_views_for_status(TaskStatus.RUNNING)
323
+ completed_tasks = self._task_views_for_status(TaskStatus.COMPLETED)
324
+ blocked_tasks = self._task_views_for_status(TaskStatus.BLOCKED)
325
+ effective_workers = self._effective_worker_capacity(env_state)
326
+ return WorkflowArenaObservation(
327
+ done=done,
328
+ reward=reward,
329
+ config=self._config,
330
+ current_time=env_state.current_time,
331
+ total_workers=self._config.worker_count,
332
+ effective_workers=effective_workers,
333
+ degraded_workers=env_state.degraded_workers,
334
+ free_workers=max(0, effective_workers - len(running_tasks)),
335
+ time_budget=env_state.time_budget,
336
+ time_remaining=self._time_remaining(env_state),
337
+ progress=ProgressSummary(
338
+ total=len(episode.tasks),
339
+ blocked=len(blocked_tasks),
340
+ ready=len(ready_tasks),
341
+ running=len(running_tasks),
342
+ completed=len(completed_tasks),
343
+ ),
344
+ ready_tasks=ready_tasks,
345
+ running_tasks=running_tasks,
346
+ completed_tasks=completed_tasks,
347
+ blocked_tasks=blocked_tasks,
348
+ last_reward_breakdown=breakdown,
349
+ cumulative_reward=self._cumulative_reward,
350
+ success_metrics=self._success_metrics(
351
+ benchmark_score_override=benchmark_score_override
352
+ ),
353
+ note=note,
354
+ benchmark_score=benchmark_score_override,
355
+ recent_failure_events=env_state.recent_failure_events,
356
+ metadata={
357
+ "phase": "simulation_active",
358
+ "note": note,
359
+ "effective_workers": effective_workers,
360
+ "degraded_workers": env_state.degraded_workers,
361
+ "time_budget": env_state.time_budget,
362
+ "time_remaining": self._time_remaining(env_state),
363
+ "recent_failure_events": [
364
+ event.model_dump(mode="json") for event in env_state.recent_failure_events
365
+ ],
366
+ "episode_loop": [
367
+ "reset generates a seeded workflow DAG episode",
368
+ "dispatch(task_ids=[...]) starts ready tasks if workers are free",
369
+ "wait() advances simulated time to the next completion event",
370
+ "medium and hard episodes may end at a fixed time budget",
371
+ "hard mode may trigger outages and retry failures",
372
+ ],
373
+ },
374
+ )
375
+
376
+ def _lower_bound_makespan(self, episode: WorkflowEpisodeSpec) -> int:
377
+ total_work = sum(task.duration for task in episode.tasks)
378
+ work_bound = (total_work + self._config.worker_count - 1) // self._config.worker_count
379
+ path_bound = max(task.critical_path_length for task in episode.tasks)
380
+ return max(1, work_bound, path_bound)
381
+
382
+ def _termination_breakdown(
383
+ self,
384
+ *,
385
+ invalid_penalty: float = 0.0,
386
+ idle_penalty: float = 0.0,
387
+ terminal_makespan_score: float = 0.0,
388
+ unfinished_task_penalty: float = 0.0,
389
+ ) -> RewardBreakdown:
390
+ return RewardBreakdown(
391
+ invalid_action_penalty=round(invalid_penalty, 4),
392
+ idle_penalty=round(idle_penalty, 4),
393
+ terminal_makespan_score=round(terminal_makespan_score, 4),
394
+ unfinished_task_penalty=round(unfinished_task_penalty, 4),
395
+ )
396
+
397
+ def _terminate_episode(
398
+ self,
399
+ *,
400
+ note: str,
401
+ breakdown: RewardBreakdown,
402
+ reward: float,
403
+ reason: str,
404
+ benchmark_score: float | None = None,
405
+ ) -> WorkflowArenaObservation:
406
+ if benchmark_score is None:
407
+ benchmark_score = self._benchmark_score()
408
+ self._cumulative_reward += reward
409
+ observation = self._base_observation(
410
+ reward=reward,
411
+ breakdown=breakdown,
412
+ note=note,
413
+ done=True,
414
+ benchmark_score_override=benchmark_score,
415
+ )
416
+ observation.termination_reason = reason
417
+ observation.benchmark_score = benchmark_score
418
+ observation.metadata["termination_reason"] = reason
419
+ observation.metadata["benchmark_score"] = benchmark_score
420
+ return observation
421
+
422
+ def _step_limit_reached(self) -> bool:
423
+ return self._state.step_count >= self._max_episode_steps
424
+
425
+ def _maybe_terminate_for_limits(self) -> WorkflowArenaObservation | None:
426
+ if not self._step_limit_reached():
427
+ return None
428
+ _, env_state = self._require_episode()
429
+ unfinished_penalty = self._unfinished_task_penalty(env_state.current_time)
430
+ terminal_score = self._terminal_score()
431
+ breakdown = self._termination_breakdown(
432
+ terminal_makespan_score=terminal_score,
433
+ unfinished_task_penalty=unfinished_penalty,
434
+ )
435
+ reward = round(-1.0 + unfinished_penalty + terminal_score, 4)
436
+ return self._terminate_episode(
437
+ note="Episode terminated after hitting the safety step limit.",
438
+ breakdown=breakdown,
439
+ reward=reward,
440
+ reason="step_limit",
441
+ )
442
+
443
+ def _apply_invalid(
444
+ self,
445
+ message: str,
446
+ *,
447
+ penalty: float | None = None,
448
+ ) -> WorkflowArenaObservation:
449
+ _, env_state = self._require_episode()
450
+ applied_penalty = (
451
+ self.INVALID_ACTION_PENALTY if penalty is None else float(penalty)
452
+ )
453
+ breakdown = RewardBreakdown(invalid_action_penalty=round(applied_penalty, 4))
454
+ self._cumulative_reward += breakdown.invalid_action_penalty
455
+ self._set_recent_failure_events(env_state, [])
456
+ observation = self._base_observation(
457
+ reward=breakdown.invalid_action_penalty,
458
+ breakdown=breakdown,
459
+ note="Invalid action.",
460
+ done=False,
461
+ )
462
+ observation.validation_error = message
463
+ observation.metadata["validation_error"] = message
464
+ return observation
465
+
466
+ def _transition_unlocks(self, completed_task_ids: list[str]) -> list[str]:
467
+ episode, env_state = self._require_episode()
468
+ task_map = {task.task_id: task for task in episode.tasks}
469
+ unlocked: list[str] = []
470
+ for task_id in completed_task_ids:
471
+ for dependent_id in task_map[task_id].dependents:
472
+ env_state.task_remaining_dependencies[dependent_id] -= 1
473
+ if env_state.task_remaining_dependencies[dependent_id] == 0:
474
+ env_state.task_statuses[dependent_id] = TaskStatus.READY
475
+ if dependent_id not in env_state.ready_task_ids:
476
+ env_state.ready_task_ids.append(dependent_id)
477
+ if dependent_id in env_state.blocked_task_ids:
478
+ env_state.blocked_task_ids.remove(dependent_id)
479
+ unlocked.append(dependent_id)
480
+ env_state.ready_task_ids.sort()
481
+ env_state.blocked_task_ids.sort()
482
+ return unlocked
483
+
484
+ def reset(
485
+ self,
486
+ seed: int | None = None,
487
+ episode_id: str | None = None,
488
+ **kwargs: Any,
489
+ ) -> WorkflowArenaObservation:
490
+ """Generate a seeded workflow DAG episode."""
491
+
492
+ preset_raw = kwargs.pop("preset", DifficultyPreset.EASY)
493
+ worker_count_raw = kwargs.pop("worker_count", None)
494
+ del kwargs
495
+ preset = (
496
+ preset_raw
497
+ if isinstance(preset_raw, DifficultyPreset)
498
+ else DifficultyPreset(str(preset_raw))
499
+ )
500
+ preset_config = get_preset_config(preset)
501
+ chosen_seed = 0 if seed is None else seed
502
+ chosen_worker_count = (
503
+ preset_config.worker_count
504
+ if worker_count_raw is None
505
+ else int(worker_count_raw)
506
+ )
507
+ chosen_episode_id = str(uuid4()) if episode_id is None else episode_id
508
+ self._state = State(episode_id=chosen_episode_id, step_count=0)
509
+ self._cumulative_reward = 0.0
510
+ self._config = EpisodeConfig(
511
+ preset=preset,
512
+ seed=chosen_seed,
513
+ worker_count=chosen_worker_count,
514
+ )
515
+ self._event_rng = random.Random(
516
+ (chosen_seed + 1) * 1009
517
+ + (chosen_worker_count * 131)
518
+ + (list(DifficultyPreset).index(preset) + 1)
519
+ )
520
+ self._episode_spec, self._env_state = generate_episode(self._config)
521
+ self._max_episode_steps = max(
522
+ self.STEP_LIMIT_FLOOR,
523
+ len(self._episode_spec.tasks) * self.STEP_LIMIT_MULTIPLIER,
524
+ )
525
+ self._env_state.episode_id = chosen_episode_id
526
+ lower_bound = self._lower_bound_makespan(self._episode_spec)
527
+ if preset_config.time_budget_multiplier is not None:
528
+ self._env_state.time_budget = int(
529
+ math.ceil(lower_bound * preset_config.time_budget_multiplier)
530
+ )
531
+ self._set_recent_failure_events(self._env_state, [])
532
+
533
+ note = "Workflow episode generated. Dispatch ready tasks or wait for completions."
534
+ if self._env_state.time_budget is not None:
535
+ note = (
536
+ f"Workflow episode generated. Finish as much as possible before "
537
+ f"t={self._env_state.time_budget}."
538
+ )
539
+ if preset == DifficultyPreset.HARD:
540
+ note += " Hard mode may trigger worker outages and retry failures."
541
+ return self._base_observation(
542
+ reward=0.0,
543
+ breakdown=RewardBreakdown(),
544
+ note=note,
545
+ done=False,
546
+ )
547
+
548
+ def _wait_note(
549
+ self,
550
+ *,
551
+ completed_now: list[str],
552
+ failed_now: list[str],
553
+ unlocked: list[str],
554
+ recent_events: list[WorkflowFailureEvent],
555
+ time_budget_hit: bool = False,
556
+ ) -> str:
557
+ chunks: list[str] = []
558
+ if time_budget_hit:
559
+ chunks.append("Time budget exhausted before the next completion event.")
560
+ elif completed_now:
561
+ chunks.append(f"Completed: {', '.join(completed_now)}.")
562
+ else:
563
+ chunks.append("Advanced to next completion event.")
564
+ if failed_now:
565
+ chunks.append(f"Retry required: {', '.join(failed_now)}.")
566
+ if unlocked:
567
+ chunks.append(f"Unlocked: {', '.join(unlocked)}.")
568
+ for event in recent_events:
569
+ if event.event_type == FailureEventType.WORKER_OUTAGE_START:
570
+ chunks.append(event.detail)
571
+ elif event.event_type == FailureEventType.WORKER_OUTAGE_END:
572
+ chunks.append("Worker capacity restored.")
573
+ return " ".join(chunks)
574
+
575
+ def step(
576
+ self,
577
+ action: WorkflowArenaAction,
578
+ timeout_s: float | None = None,
579
+ **kwargs: Any,
580
+ ) -> WorkflowArenaObservation:
581
+ """Apply a dispatch or wait action using event-driven semantics."""
582
+
583
+ del timeout_s, kwargs
584
+ episode, env_state = self._require_episode()
585
+ task_map = {task.task_id: task for task in episode.tasks}
586
+ self._state.step_count += 1
587
+ self._set_recent_failure_events(env_state, [])
588
+
589
+ limit_termination = self._maybe_terminate_for_limits()
590
+ if limit_termination is not None:
591
+ return limit_termination
592
+
593
+ if action.action_type == WorkflowActionType.WAIT and action.task_ids:
594
+ return self._apply_invalid("wait() must not include task_ids.")
595
+
596
+ if action.action_type == WorkflowActionType.DISPATCH:
597
+ if not action.task_ids:
598
+ return self._apply_invalid(
599
+ "dispatch(task_ids=[...]) requires at least one task id."
600
+ )
601
+ if len(set(action.task_ids)) != len(action.task_ids):
602
+ return self._apply_invalid(
603
+ "dispatch(task_ids=[...]) must not contain duplicate task ids."
604
+ )
605
+
606
+ free_workers = self._effective_worker_capacity(env_state) - len(
607
+ env_state.running_task_ids
608
+ )
609
+ if len(action.task_ids) > max(0, free_workers):
610
+ return self._apply_invalid(
611
+ "dispatch(task_ids=[...]) cannot exceed available worker capacity.",
612
+ penalty=self.OVERCAPACITY_INVALID_ACTION_PENALTY,
613
+ )
614
+
615
+ unknown_tasks = [task_id for task_id in action.task_ids if task_id not in task_map]
616
+ if unknown_tasks:
617
+ return self._apply_invalid(f"Unknown task ids: {unknown_tasks}.")
618
+
619
+ not_ready = [
620
+ task_id
621
+ for task_id in action.task_ids
622
+ if env_state.task_statuses[task_id] != TaskStatus.READY
623
+ ]
624
+ if not_ready:
625
+ return self._apply_invalid(
626
+ f"Only ready tasks can be dispatched: {not_ready}."
627
+ )
628
+
629
+ prev_utilization_potential, prev_criticality_potential = self._dispatch_potential(
630
+ env_state, task_map
631
+ )
632
+ for task_id in action.task_ids:
633
+ task = task_map[task_id]
634
+ env_state.task_statuses[task_id] = TaskStatus.RUNNING
635
+ env_state.task_start_times[task_id] = env_state.current_time
636
+ env_state.task_assigned_finish_times[task_id] = (
637
+ env_state.current_time + task.duration
638
+ )
639
+ env_state.running_task_ids.append(task_id)
640
+ env_state.ready_task_ids.remove(task_id)
641
+ env_state.running_task_ids.sort()
642
+ next_utilization_potential, next_criticality_potential = self._dispatch_potential(
643
+ env_state, task_map
644
+ )
645
+ breakdown = RewardBreakdown(
646
+ utilization_reward=round(
647
+ next_utilization_potential - prev_utilization_potential, 4
648
+ ),
649
+ criticality_reward=round(
650
+ next_criticality_potential - prev_criticality_potential, 4
651
+ ),
652
+ )
653
+ reward = round(
654
+ breakdown.utilization_reward + breakdown.criticality_reward,
655
+ 4,
656
+ )
657
+ self._cumulative_reward += reward
658
+ observation = self._base_observation(
659
+ reward=reward,
660
+ breakdown=breakdown,
661
+ note="Tasks dispatched. Use wait() to advance to the next completion event.",
662
+ done=False,
663
+ )
664
+ observation.received_action = action.model_dump(mode="json")
665
+ observation.metadata["received_action"] = action.model_dump(mode="json")
666
+ return observation
667
+
668
+ if not env_state.running_task_ids:
669
+ return self._apply_invalid("wait() requires at least one running task.")
670
+
671
+ recent_events: list[WorkflowFailureEvent] = []
672
+ avoidable_wait_penalty = 0.0
673
+ if env_state.ready_task_ids:
674
+ free_workers = self._effective_worker_capacity(env_state) - len(
675
+ env_state.running_task_ids
676
+ )
677
+ if free_workers > 0:
678
+ avoidable_wait_penalty = self.AVOIDABLE_WAIT_PENALTY_PER_SLOT * min(
679
+ free_workers,
680
+ len(env_state.ready_task_ids),
681
+ )
682
+
683
+ self._maybe_start_worker_outage(env_state, recent_events)
684
+
685
+ next_completion_time = min(
686
+ env_state.task_assigned_finish_times[task_id]
687
+ for task_id in env_state.running_task_ids
688
+ )
689
+ target_time = next_completion_time
690
+ budget_hit_before_completion = False
691
+ if env_state.time_budget is not None and env_state.time_budget < next_completion_time:
692
+ target_time = env_state.time_budget
693
+ budget_hit_before_completion = True
694
+
695
+ elapsed = target_time - env_state.current_time
696
+ env_state.cumulative_busy_time += elapsed * len(env_state.running_task_ids)
697
+ env_state.current_time = target_time
698
+ self._maybe_end_worker_outage(env_state, recent_events)
699
+
700
+ if budget_hit_before_completion:
701
+ unfinished_penalty = self._unfinished_task_penalty(env_state.current_time)
702
+ terminal_score = self._terminal_score()
703
+ breakdown = RewardBreakdown(
704
+ idle_penalty=round(avoidable_wait_penalty, 4),
705
+ terminal_makespan_score=round(terminal_score, 4),
706
+ unfinished_task_penalty=round(unfinished_penalty, 4),
707
+ )
708
+ reward = round(
709
+ breakdown.idle_penalty
710
+ + breakdown.terminal_makespan_score
711
+ + breakdown.unfinished_task_penalty,
712
+ 4,
713
+ )
714
+ self._set_recent_failure_events(env_state, recent_events)
715
+ note = self._wait_note(
716
+ completed_now=[],
717
+ failed_now=[],
718
+ unlocked=[],
719
+ recent_events=recent_events,
720
+ time_budget_hit=True,
721
+ )
722
+ observation = self._terminate_episode(
723
+ note=note,
724
+ breakdown=breakdown,
725
+ reward=reward,
726
+ reason="time_budget",
727
+ )
728
+ observation.received_action = action.model_dump(mode="json")
729
+ observation.metadata["received_action"] = action.model_dump(mode="json")
730
+ return observation
731
+
732
+ completed_candidates = sorted(
733
+ [
734
+ task_id
735
+ for task_id in env_state.running_task_ids
736
+ if env_state.task_assigned_finish_times[task_id] == next_completion_time
737
+ ]
738
+ )
739
+ completed_now: list[str] = []
740
+ failed_now: list[str] = []
741
+ for task_id in completed_candidates:
742
+ env_state.running_task_ids.remove(task_id)
743
+ del env_state.task_assigned_finish_times[task_id]
744
+ if self._should_retry_fail(task_id):
745
+ env_state.task_attempt_counts[task_id] += 1
746
+ env_state.task_statuses[task_id] = TaskStatus.READY
747
+ env_state.task_start_times.pop(task_id, None)
748
+ env_state.task_end_times.pop(task_id, None)
749
+ if task_id not in env_state.ready_task_ids:
750
+ env_state.ready_task_ids.append(task_id)
751
+ failed_now.append(task_id)
752
+ recent_events.append(
753
+ WorkflowFailureEvent(
754
+ event_type=FailureEventType.TASK_RETRY_FAILURE,
755
+ time=next_completion_time,
756
+ task_id=task_id,
757
+ detail=f"{task_id} failed and returned to ready.",
758
+ )
759
+ )
760
+ else:
761
+ env_state.task_statuses[task_id] = TaskStatus.COMPLETED
762
+ env_state.task_end_times[task_id] = next_completion_time
763
+ env_state.completed_task_ids.append(task_id)
764
+ completed_now.append(task_id)
765
+
766
+ env_state.completed_task_ids.sort()
767
+ env_state.ready_task_ids.sort()
768
+ unlocked = self._transition_unlocks(completed_now)
769
+
770
+ completion_reward = sum(
771
+ 0.04 + 0.01 * task_map[task_id].priority for task_id in completed_now
772
+ )
773
+ deadline_reward = 0.0
774
+ criticality_reward = 0.0
775
+ for task_id in completed_now:
776
+ task = task_map[task_id]
777
+ if task.deadline is not None:
778
+ lateness = next_completion_time - task.deadline
779
+ deadline_reward += 0.05 if lateness <= 0 else -0.02 * lateness
780
+ criticality_reward += 0.03 * task.criticality
781
+
782
+ utilization_reward = 0.06 * (
783
+ elapsed
784
+ * (len(completed_candidates) + len(env_state.running_task_ids))
785
+ / max(1, self._config.worker_count)
786
+ )
787
+ idle_penalty = 0.0
788
+ if not env_state.running_task_ids and env_state.ready_task_ids:
789
+ idle_penalty = -0.03 * len(env_state.ready_task_ids)
790
+
791
+ done = len(env_state.completed_task_ids) == len(episode.tasks)
792
+ breakdown = RewardBreakdown(
793
+ completion_reward=round(completion_reward, 4),
794
+ utilization_reward=round(utilization_reward, 4),
795
+ deadline_reward=round(deadline_reward, 4),
796
+ criticality_reward=round(criticality_reward, 4),
797
+ idle_penalty=round(idle_penalty + avoidable_wait_penalty, 4),
798
+ terminal_makespan_score=round(self._terminal_score() if done else 0.0, 4),
799
+ )
800
+ reward = round(
801
+ breakdown.completion_reward
802
+ + breakdown.utilization_reward
803
+ + breakdown.deadline_reward
804
+ + breakdown.criticality_reward
805
+ + breakdown.idle_penalty
806
+ + breakdown.terminal_makespan_score,
807
+ 4,
808
+ )
809
+
810
+ budget_exhausted_now = (
811
+ not done
812
+ and env_state.time_budget is not None
813
+ and env_state.current_time >= env_state.time_budget
814
+ )
815
+ if budget_exhausted_now:
816
+ unfinished_penalty = self._unfinished_task_penalty(env_state.current_time)
817
+ breakdown.unfinished_task_penalty = round(unfinished_penalty, 4)
818
+ breakdown.terminal_makespan_score = round(self._terminal_score(), 4)
819
+ reward = round(
820
+ reward
821
+ + breakdown.unfinished_task_penalty
822
+ + breakdown.terminal_makespan_score,
823
+ 4,
824
+ )
825
+ self._set_recent_failure_events(env_state, recent_events)
826
+ note = self._wait_note(
827
+ completed_now=completed_now,
828
+ failed_now=failed_now,
829
+ unlocked=unlocked,
830
+ recent_events=recent_events,
831
+ time_budget_hit=True,
832
+ )
833
+ observation = self._terminate_episode(
834
+ note=note,
835
+ breakdown=breakdown,
836
+ reward=reward,
837
+ reason="time_budget",
838
+ )
839
+ observation.received_action = action.model_dump(mode="json")
840
+ observation.metadata["received_action"] = action.model_dump(mode="json")
841
+ observation.metadata["completed_now"] = completed_now
842
+ observation.metadata["unlocked_now"] = unlocked
843
+ observation.metadata["failed_now"] = failed_now
844
+ return observation
845
+
846
+ self._cumulative_reward += reward
847
+ self._set_recent_failure_events(env_state, recent_events)
848
+ observation = self._base_observation(
849
+ reward=reward,
850
+ breakdown=breakdown,
851
+ note=self._wait_note(
852
+ completed_now=completed_now,
853
+ failed_now=failed_now,
854
+ unlocked=unlocked,
855
+ recent_events=recent_events,
856
+ ),
857
+ done=done,
858
+ benchmark_score_override=self._benchmark_score() if done else None,
859
+ )
860
+ observation.received_action = action.model_dump(mode="json")
861
+ observation.metadata["received_action"] = action.model_dump(mode="json")
862
+ observation.metadata["completed_now"] = completed_now
863
+ observation.metadata["unlocked_now"] = unlocked
864
+ observation.metadata["failed_now"] = failed_now
865
+ if done:
866
+ observation.benchmark_score = observation.success_metrics.benchmark_score
867
+ return observation
868
+
869
+ @property
870
+ def state(self) -> State:
871
+ """Expose generic OpenEnv state metadata."""
872
+
873
+ return self._state
uv.lock ADDED
The diff for this file is too large to render. See raw diff