Spaces:
Paused
Paused
# environment identity: name, version, and a one-line summary.
# version is quoted so it is always read as a string.
name: sysadmin-env
version: "0.2.0"
description: reinforcement learning environment for linux server auto remediation
# runtime settings: python version, entry points, the hosted url, and the
# http / websocket endpoint paths.
# NOTE(review): the captured copy of this file had lost its indentation; every
# key between `runtime:` and `resources:` is assumed to nest under `runtime`.
# confirm against the consumer's schema.
runtime:
  python: "3.11"  # quoted: a bare 3.11 would be typed as a float
  entry_point: inference.py
  server_entry_point: server.app:app
  live_url: https://huggingmenfordays-enterprise-hpc-openenv.hf.space
  reset_endpoint: /reset
  step_endpoint: /step
  state_endpoint: /state
  websocket_endpoint: /ws
  healthcheck_endpoint: /health
  tasks_endpoint: /tasks
# resource envelope for the environment; units are carried in the key names
# (memory in gb, runtime in minutes).
resources:
  vcpus: 2
  memory_gb: 8
  # a plain `none` is a string in yaml, not null; quoted to make that explicit.
  gpu: "none"
  # NOTE(review): 20 minutes (1200 s) equals the sum of the three warm-up task
  # time limits (300 + 420 + 480 s) and is well below
  # evaluation.max_total_runtime_seconds. confirm whether this is a per-run
  # cap or a value left over from the warm-up-only task set.
  max_runtime_minutes: 20
# task catalogue. every entry carries the same six fields:
#   id, tier, difficulty, description, max_steps, time_limit_seconds
tasks:
  # warm-up curriculum tier (round 1 legacy): single-app remediations used as
  # a difficulty ramp, so a freshly initialized policy can accumulate non-zero
  # reward before the multi-app hpc scenarios kick in. this tier is not the
  # focus of the round 2 submission.
  - id: nginx_crash
    tier: warmup
    difficulty: easy
    description: nginx crash with stale pid and config syntax error (warm-up tier)
    max_steps: 40
    time_limit_seconds: 300
  - id: disk_full
    tier: warmup
    difficulty: medium
    description: hidden sparse log file filling a loopback mount (warm-up tier)
    max_steps: 55
    time_limit_seconds: 420
  - id: network_broken
    tier: warmup
    difficulty: hard
    description: broken network namespace with corrupted routing tables (warm-up tier)
    max_steps: 70
    time_limit_seconds: 480

  # round 2 hpc tier: multi-app enterprise incident response scenarios. this
  # is the tier the grpo trainer samples from by default, and the tier judges
  # should score on for theme #3.1 (scaler ai labs multi-app rl environment
  # for enterprise workflows).
  - id: hpc_outage
    tier: hpc
    difficulty: hard
    description: multi node hpc cluster outage with drained compute and broken ood portal
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_munge
    tier: hpc
    difficulty: hard
    description: compute node draining due to a munge key permission fault and broken route
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_pid_stale
    tier: hpc
    difficulty: hard
    description: slurmd refuses to restart after reboot because a stale pid file is still on disk
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_gpu_ecc
    tier: hpc
    difficulty: hard
    description: compute node drained because nvidia-smi reports gpu-0 uncorrectable ecc errors
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_nfs_stale
    tier: hpc
    difficulty: hard
    description: compute node drained because the nfs share at /mnt/shared reports stale file handle
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_ood_apache
    tier: hpc
    difficulty: medium
    description: open ondemand apache portal on :8081 returns 500 due to a one character typo in httpd.conf
    max_steps: 80
    time_limit_seconds: 540
# evaluation plan: tasks are run one after another in the order given by
# tasks_order.
evaluation:
  protocol: sequential
  # sized to the sum of time_limit_seconds over the nine tasks in tasks_order
  # (300 + 420 + 480 + 5 * 600 + 540 = 4740), so a sequential run that uses
  # every per-task limit in full still leaves the last task its whole budget.
  # NOTE(review): assumes this cap is meant to cover the worst case; lower it
  # only if an overall budget tighter than the per-task limits is intended.
  max_total_runtime_seconds: 4740
  tasks_order:
    - nginx_crash
    - disk_full
    - network_broken
    - hpc_outage
    - hpc_munge
    - hpc_pid_stale
    - hpc_gpu_ecc
    - hpc_nfs_stale
    - hpc_ood_apache