Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- Dockerfile +80 -0
- README.md +549 -5
- __init__.py +72 -0
- client.py +123 -0
- models.py +92 -0
- openenv.yaml +5 -0
- pyproject.toml +33 -0
- server/__init__.py +1 -0
- server/app.py +42 -0
- server/browsergym_environment.py +303 -0
- server/requirements.txt +9 -0
- server/start.sh +29 -0
Dockerfile
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ARG BASE_IMAGE=openenv-base:latest
|
| 2 |
+
FROM ${BASE_IMAGE}
|
| 3 |
+
|
| 4 |
+
# Install system dependencies for Playwright and browsers
|
| 5 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 6 |
+
# Playwright browser dependencies
|
| 7 |
+
libnss3 \
|
| 8 |
+
libnspr4 \
|
| 9 |
+
libatk1.0-0 \
|
| 10 |
+
libatk-bridge2.0-0 \
|
| 11 |
+
libcups2 \
|
| 12 |
+
libdrm2 \
|
| 13 |
+
libdbus-1-3 \
|
| 14 |
+
libxkbcommon0 \
|
| 15 |
+
libatspi2.0-0 \
|
| 16 |
+
libxcomposite1 \
|
| 17 |
+
libxdamage1 \
|
| 18 |
+
libxfixes3 \
|
| 19 |
+
libxrandr2 \
|
| 20 |
+
libgbm1 \
|
| 21 |
+
libpango-1.0-0 \
|
| 22 |
+
libcairo2 \
|
| 23 |
+
libasound2 \
|
| 24 |
+
libxshmfence1 \
|
| 25 |
+
fonts-unifont \
|
| 26 |
+
fonts-noto-color-emoji \
|
| 27 |
+
# Additional dependencies
|
| 28 |
+
git \
|
| 29 |
+
wget \
|
| 30 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 31 |
+
|
| 32 |
+
# Install BrowserGym and related packages
|
| 33 |
+
COPY src/envs/browsergym_env/server/requirements.txt /tmp/browsergym_requirements.txt
|
| 34 |
+
RUN pip install --no-cache-dir -r /tmp/browsergym_requirements.txt && \
|
| 35 |
+
rm /tmp/browsergym_requirements.txt
|
| 36 |
+
|
| 37 |
+
# Install Playwright browsers (Chromium by default)
|
| 38 |
+
RUN playwright install chromium
|
| 39 |
+
|
| 40 |
+
# Install MiniWoB++ tasks
|
| 41 |
+
RUN pip install browsergym-miniwob
|
| 42 |
+
RUN git clone --depth 1 https://github.com/Farama-Foundation/miniwob-plusplus.git /app/miniwob-plusplus
|
| 43 |
+
|
| 44 |
+
# Copy OpenEnv core and browsergym_env code
|
| 45 |
+
WORKDIR /app
|
| 46 |
+
COPY src/core/ /app/src/core/
|
| 47 |
+
COPY src/envs/browsergym_env/ /app/src/envs/browsergym_env/
|
| 48 |
+
COPY src/envs/browsergym_env/README.md /app/README.md
|
| 49 |
+
RUN chmod +x /app/src/envs/browsergym_env/server/start.sh
|
| 50 |
+
|
| 51 |
+
# Set environment variables
|
| 52 |
+
ENV PYTHONPATH=/app/src
|
| 53 |
+
ENV PYTHONUNBUFFERED=1
|
| 54 |
+
ENV BROWSERGYM_BENCHMARK=miniwob
|
| 55 |
+
ENV BROWSERGYM_TASK_NAME="click-test"
|
| 56 |
+
ENV BROWSERGYM_HEADLESS=true
|
| 57 |
+
ENV BROWSERGYM_VIEWPORT_WIDTH=1280
|
| 58 |
+
ENV BROWSERGYM_VIEWPORT_HEIGHT=720
|
| 59 |
+
ENV BROWSERGYM_TIMEOUT=10000
|
| 60 |
+
ENV MINIWOB_HTML_DIR=/app/miniwob-plusplus/miniwob/html
|
| 61 |
+
ENV MINIWOB_HTTP_PORT=8888
|
| 62 |
+
ENV MINIWOB_URL=http://127.0.0.1:8888/miniwob/
|
| 63 |
+
|
| 64 |
+
# For WebArena tasks, these should be set by the user when running the container:
|
| 65 |
+
# ENV SHOPPING=
|
| 66 |
+
# ENV SHOPPING_ADMIN=
|
| 67 |
+
# ENV REDDIT=
|
| 68 |
+
# ENV GITLAB=
|
| 69 |
+
# ENV MAP=
|
| 70 |
+
# ENV WIKIPEDIA=
|
| 71 |
+
# ENV HOMEPAGE=
|
| 72 |
+
|
| 73 |
+
EXPOSE 8000
|
| 74 |
+
EXPOSE 8888
|
| 75 |
+
|
| 76 |
+
HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \
|
| 77 |
+
CMD curl -f http://localhost:8000/health || exit 1
|
| 78 |
+
|
| 79 |
+
ENV ENABLE_WEB_INTERFACE=true
|
| 80 |
+
CMD ["/app/src/envs/browsergym_env/server/start.sh"]
|
README.md
CHANGED
|
@@ -1,10 +1,554 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: BrowserGym Environment Server
|
| 3 |
+
emoji: 🌐
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
app_port: 8000
|
| 9 |
+
base_path: /web
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
- browsergym
|
| 13 |
+
- web-automation
|
| 14 |
+
- reinforcement-learning
|
| 15 |
---
|
| 16 |
|
| 17 |
+
# BrowserGym Environment
|
| 18 |
+
|
| 19 |
+
BrowserGym is a unified framework for web-based agent tasks that provides access to multiple benchmarks under a single Gymnasium-compatible API. This integration brings the complete training-to-evaluation pipeline for web agents into OpenEnv.
|
| 20 |
+
|
| 21 |
+
## Why BrowserGym?
|
| 22 |
+
|
| 23 |
+
BrowserGym provides a complete pipeline for developing web agents: train on simple tasks, then evaluate on realistic websites.
|
| 24 |
+
|
| 25 |
+
**What are these benchmarks?**
|
| 26 |
+
|
| 27 |
+
- **MiniWoB++ (Training)**: 100+ synthetic web tasks like "click this button", "fill out this form", "select from dropdown". Each task is a simple webpage with a clear objective. Fast resets, randomized variations, dense rewards. Perfect for learning basic web navigation skills. **No external setup needed** - tasks run in isolated browser sessions.
|
| 28 |
+
|
| 29 |
+
- **WebArena (Evaluation)**: 812 tasks on real websites (e-commerce, forums, GitLab, Wikipedia). Tasks like "find the cheapest laptop and add to cart" or "create a merge request for bug #123". Multi-step tasks that require reasoning and give sparse rewards. Tests if your agent can handle actual websites. **Requires running 7 backend services** (shopping site, GitLab instance, etc.).
|
| 30 |
+
|
| 31 |
+
- **VisualWebArena**: Similar to WebArena but requires visual understanding - agents need to interpret images, identify UI elements visually, handle multimodal content.
|
| 32 |
+
|
| 33 |
+
- **WorkArena**: Enterprise software tasks (CRM, project management, business workflows). Tests automation on corporate-style applications.
|
| 34 |
+
|
| 35 |
+
**The training → evaluation pipeline:**
|
| 36 |
+
1. Train on MiniWoB (simple, controlled, fast iterations)
|
| 37 |
+
2. Evaluate on WebArena (complex, realistic, measures real-world capability)
|
| 38 |
+
|
| 39 |
+
**Key advantage**: You can start training immediately with MiniWoB. No need to set up infrastructure just to test if your code works.
|
| 40 |
+
|
| 41 |
+
## Quick Start - Training (MiniWoB)
|
| 42 |
+
|
| 43 |
+
### No Setup Required! 🎉
|
| 44 |
+
|
| 45 |
+
```python
|
| 46 |
+
from envs.browsergym_env import BrowserGymEnv, BrowserGymAction
|
| 47 |
+
|
| 48 |
+
# Create environment for MiniWoB training task
|
| 49 |
+
env = BrowserGymEnv.from_docker_image(
|
| 50 |
+
"ghcr.io/openenv/browsergym-env:latest",
|
| 51 |
+
environment={
|
| 52 |
+
"BROWSERGYM_BENCHMARK": "miniwob",
|
| 53 |
+
"BROWSERGYM_TASK_NAME": "click-test", # or "click-button", "click-dialog", etc.
|
| 54 |
+
}
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
# Train your agent!
|
| 58 |
+
for episode in range(1000):
|
| 59 |
+
result = env.reset()
|
| 60 |
+
print(f"Goal: {result.observation.goal}")
|
| 61 |
+
|
| 62 |
+
done = False
|
| 63 |
+
while not done:
|
| 64 |
+
# Your agent decides what to do
|
| 65 |
+
action_str = agent.get_action(result.observation.text)
|
| 66 |
+
action = BrowserGymAction(action_str=action_str)
|
| 67 |
+
|
| 68 |
+
result = env.step(action)
|
| 69 |
+
done = result.done
|
| 70 |
+
|
| 71 |
+
print(f"Reward: {result.reward}")
|
| 72 |
+
|
| 73 |
+
env.close()
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
### Available Tasks by Benchmark
|
| 77 |
+
|
| 78 |
+
#### MiniWoB++ Tasks (Training - 100+ tasks)
|
| 79 |
+
|
| 80 |
+
MiniWoB tasks are organized by difficulty and type. Here are the main categories:
|
| 81 |
+
|
| 82 |
+
**Click Tasks** (Basic interaction)
|
| 83 |
+
| Task Name | Description | Difficulty |
|
| 84 |
+
|-----------|-------------|------------|
|
| 85 |
+
| `click-test` | Click a single button | ⭐ Easy |
|
| 86 |
+
| `click-button` | Click button with specific text | ⭐ Easy |
|
| 87 |
+
| `click-button-sequence` | Click buttons in order | ⭐⭐ Medium |
|
| 88 |
+
| `click-checkboxes` | Select specific checkboxes | ⭐⭐ Medium |
|
| 89 |
+
| `click-checkboxes-soft` | Select checkboxes (multiple valid) | ⭐⭐ Medium |
|
| 90 |
+
| `click-checkboxes-large` | Many checkboxes to select from | ⭐⭐ Medium |
|
| 91 |
+
| `click-checkboxes-transfer` | Transfer learning variation | ⭐⭐ Medium |
|
| 92 |
+
| `click-dialog` | Click correct button in dialog | ⭐ Easy |
|
| 93 |
+
| `click-dialog-2` | More complex dialog | ⭐⭐ Medium |
|
| 94 |
+
| `click-link` | Click on a link | ⭐ Easy |
|
| 95 |
+
| `click-option` | Select from dropdown | ⭐⭐ Medium |
|
| 96 |
+
| `click-pie` | Click on pie chart slice | ⭐⭐ Medium |
|
| 97 |
+
| `click-scroll-list` | Click item in scrollable list | ⭐⭐⭐ Hard |
|
| 98 |
+
| `click-shades` | Click on specific color shade | ⭐⭐ Medium |
|
| 99 |
+
| `click-shape` | Click on specific shape | ⭐⭐ Medium |
|
| 100 |
+
| `click-tab` | Switch between tabs | ⭐⭐ Medium |
|
| 101 |
+
| `click-tab-2` | More complex tab switching | ⭐⭐⭐ Hard |
|
| 102 |
+
| `click-widget` | Click on UI widget | ⭐⭐ Medium |
|
| 103 |
+
|
| 104 |
+
**Text Entry Tasks** (Typing and forms)
|
| 105 |
+
| Task Name | Description | Difficulty |
|
| 106 |
+
|-----------|-------------|------------|
|
| 107 |
+
| `enter-text` | Type text into input field | ⭐ Easy |
|
| 108 |
+
| `enter-text-dynamic` | Dynamic text entry | ⭐⭐ Medium |
|
| 109 |
+
| `enter-text-2` | Multiple text fields | ⭐⭐ Medium |
|
| 110 |
+
| `enter-password` | Fill password field | ⭐ Easy |
|
| 111 |
+
| `enter-date` | Enter a date | ⭐⭐ Medium |
|
| 112 |
+
| `enter-time` | Enter a time | ⭐⭐ Medium |
|
| 113 |
+
| `login-user` | Complete login form | ⭐⭐ Medium |
|
| 114 |
+
| `login-user-popup` | Login via popup | ⭐⭐⭐ Hard |
|
| 115 |
+
|
| 116 |
+
**Navigation Tasks** (Multi-step interaction)
|
| 117 |
+
| Task Name | Description | Difficulty |
|
| 118 |
+
|-----------|-------------|------------|
|
| 119 |
+
| `navigate-tree` | Navigate through tree structure | ⭐⭐⭐ Hard |
|
| 120 |
+
| `search-engine` | Use search interface | ⭐⭐ Medium |
|
| 121 |
+
| `use-autocomplete` | Interact with autocomplete | ⭐⭐⭐ Hard |
|
| 122 |
+
| `book-flight` | Book a flight (complex form) | ⭐⭐⭐⭐ Very Hard |
|
| 123 |
+
| `choose-date` | Pick date from calendar | ⭐⭐⭐ Hard |
|
| 124 |
+
| `choose-date-easy` | Simplified date picker | ⭐⭐ Medium |
|
| 125 |
+
| `choose-date-medium` | Medium difficulty date picker | ⭐⭐⭐ Hard |
|
| 126 |
+
| `choose-list` | Select from long list | ⭐⭐ Medium |
|
| 127 |
+
|
| 128 |
+
**Visual/Spatial Tasks** (Requires visual understanding)
|
| 129 |
+
| Task Name | Description | Difficulty |
|
| 130 |
+
|-----------|-------------|------------|
|
| 131 |
+
| `count-sides` | Count sides of shape | ⭐⭐ Medium |
|
| 132 |
+
| `count-shape` | Count specific shapes | ⭐⭐ Medium |
|
| 133 |
+
| `find-word` | Find word in text | ⭐⭐ Medium |
|
| 134 |
+
| `focus-text` | Focus on text element | ⭐ Easy |
|
| 135 |
+
| `focus-text-2` | More complex focus task | ⭐⭐ Medium |
|
| 136 |
+
| `grid-coordinate` | Click grid coordinate | ⭐⭐ Medium |
|
| 137 |
+
| `guess-number` | Guess a number game | ⭐⭐⭐ Hard |
|
| 138 |
+
| `identify-shape` | Identify shape type | ⭐⭐ Medium |
|
| 139 |
+
| `read-table` | Extract info from table | ⭐⭐⭐ Hard |
|
| 140 |
+
| `read-table-2` | More complex table reading | ⭐⭐⭐ Hard |
|
| 141 |
+
|
| 142 |
+
**Email/Social Tasks** (Realistic scenarios)
|
| 143 |
+
| Task Name | Description | Difficulty |
|
| 144 |
+
|-----------|-------------|------------|
|
| 145 |
+
| `email-inbox` | Manage email inbox | ⭐⭐⭐⭐ Very Hard |
|
| 146 |
+
| `email-inbox-forward` | Forward emails | ⭐⭐⭐⭐ Very Hard |
|
| 147 |
+
| `email-inbox-nl` | Natural language email task | ⭐⭐⭐⭐ Very Hard |
|
| 148 |
+
| `email-inbox-star-reply` | Star and reply to emails | ⭐⭐⭐⭐ Very Hard |
|
| 149 |
+
| `social-media` | Social media interaction | ⭐⭐⭐⭐ Very Hard |
|
| 150 |
+
| `social-media-some` | Partial social media task | ⭐⭐⭐ Hard |
|
| 151 |
+
|
| 152 |
+
**Total:** 100+ tasks across all categories
|
| 153 |
+
|
| 154 |
+
**Usage:**
|
| 155 |
+
```python
|
| 156 |
+
# Easy task for quick testing
|
| 157 |
+
env = BrowserGymEnv(environment={"BROWSERGYM_TASK_NAME": "click-test"})
|
| 158 |
+
|
| 159 |
+
# Medium difficulty for training
|
| 160 |
+
env = BrowserGymEnv(environment={"BROWSERGYM_TASK_NAME": "click-checkboxes"})
|
| 161 |
+
|
| 162 |
+
# Hard task for evaluation
|
| 163 |
+
env = BrowserGymEnv(environment={"BROWSERGYM_TASK_NAME": "email-inbox"})
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
#### WebArena Tasks (Evaluation - 812 tasks)
|
| 167 |
+
|
| 168 |
+
WebArena tasks are organized by website and difficulty. Tasks are numbered 0-811.
|
| 169 |
+
|
| 170 |
+
**By Website:**
|
| 171 |
+
| Website | Task Count | Description | Example Tasks |
|
| 172 |
+
|---------|------------|-------------|---------------|
|
| 173 |
+
| Shopping | ~200 | E-commerce site | Search products, add to cart, checkout |
|
| 174 |
+
| Shopping Admin | ~150 | Admin panel | Manage products, orders, customers |
|
| 175 |
+
| Reddit | ~150 | Forum/social | Post, comment, search discussions |
|
| 176 |
+
| GitLab | ~200 | Code repository | Create issues, merge requests, review code |
|
| 177 |
+
| Wikipedia | ~100 | Knowledge base | Search, read, extract information |
|
| 178 |
+
| Map | ~12 | Location service | Find places, get directions |
|
| 179 |
+
|
| 180 |
+
**By Difficulty:**
|
| 181 |
+
| Difficulty | Task Count | Steps Required | Example |
|
| 182 |
+
|------------|------------|----------------|---------|
|
| 183 |
+
| Easy | ~200 | 1-5 steps | "Find the price of product X" |
|
| 184 |
+
| Medium | ~400 | 5-15 steps | "Add cheapest laptop to cart" |
|
| 185 |
+
| Hard | ~212 | 15+ steps | "Create merge request for bug fix" |
|
| 186 |
+
|
| 187 |
+
**Usage:**
|
| 188 |
+
```python
|
| 189 |
+
# Task 0 (usually easy)
|
| 190 |
+
env = BrowserGymEnv(environment={
|
| 191 |
+
"BROWSERGYM_BENCHMARK": "webarena",
|
| 192 |
+
"BROWSERGYM_TASK_NAME": "0",
|
| 193 |
+
"SHOPPING": "http://your-server:7770",
|
| 194 |
+
# ... other URLs
|
| 195 |
+
})
|
| 196 |
+
|
| 197 |
+
# Task 156 (GitLab merge request)
|
| 198 |
+
env = BrowserGymEnv(environment={
|
| 199 |
+
"BROWSERGYM_BENCHMARK": "webarena",
|
| 200 |
+
"BROWSERGYM_TASK_NAME": "156",
|
| 201 |
+
# ... URLs
|
| 202 |
+
})
|
| 203 |
+
```
|
| 204 |
+
|
| 205 |
+
**Note:** WebArena tasks require the full backend infrastructure. See [WebArena setup guide](https://github.com/web-arena-x/webarena/tree/main/environment_docker).
|
| 206 |
+
|
| 207 |
+
#### VisualWebArena Tasks (910 tasks)
|
| 208 |
+
|
| 209 |
+
Similar to WebArena but requires visual understanding. Tasks involve:
|
| 210 |
+
- Image-based reasoning
|
| 211 |
+
- Visual element identification
|
| 212 |
+
- Multimodal interaction (text + images)
|
| 213 |
+
|
| 214 |
+
#### WorkArena Tasks
|
| 215 |
+
|
| 216 |
+
Enterprise software automation tasks:
|
| 217 |
+
- CRM operations
|
| 218 |
+
- Project management
|
| 219 |
+
- Business workflows
|
| 220 |
+
|
| 221 |
+
**Full task lists:**
|
| 222 |
+
- [MiniWoB++ tasks](https://github.com/Farama-Foundation/miniwob-plusplus/tree/master/miniwob/environment)
|
| 223 |
+
- [WebArena tasks](https://github.com/web-arena-x/webarena/blob/main/config_files/)
|
| 224 |
+
- [BrowserGym documentation](https://github.com/ServiceNow/BrowserGym)
|
| 225 |
+
|
| 226 |
+
## Evaluation (WebArena)
|
| 227 |
+
|
| 228 |
+
### Prerequisites
|
| 229 |
+
|
| 230 |
+
WebArena requires setting up backend infrastructure. See the [WebArena documentation](https://github.com/web-arena-x/webarena/tree/main/environment_docker).
|
| 231 |
+
|
| 232 |
+
### Usage
|
| 233 |
+
|
| 234 |
+
```python
|
| 235 |
+
from envs.browsergym_env import BrowserGymEnv, BrowserGymAction
|
| 236 |
+
|
| 237 |
+
# Create environment for WebArena evaluation
|
| 238 |
+
env = BrowserGymEnv.from_docker_image(
|
| 239 |
+
"ghcr.io/openenv/browsergym-env:latest",
|
| 240 |
+
environment={
|
| 241 |
+
"BROWSERGYM_BENCHMARK": "webarena",
|
| 242 |
+
"BROWSERGYM_TASK_NAME": "0", # Task ID
|
| 243 |
+
# WebArena backend URLs (required)
|
| 244 |
+
"SHOPPING": "http://your-server:7770",
|
| 245 |
+
"SHOPPING_ADMIN": "http://your-server:7780/admin",
|
| 246 |
+
"REDDIT": "http://your-server:9999",
|
| 247 |
+
"GITLAB": "http://your-server:8023",
|
| 248 |
+
"MAP": "http://your-server:3000",
|
| 249 |
+
"WIKIPEDIA": "http://your-server:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing",
|
| 250 |
+
"HOMEPAGE": "http://your-server:4399",
|
| 251 |
+
}
|
| 252 |
+
)
|
| 253 |
+
|
| 254 |
+
# Evaluate your trained agent
|
| 255 |
+
result = env.reset()
|
| 256 |
+
while not result.done:
|
| 257 |
+
action_str = agent.get_action(result.observation)
|
| 258 |
+
action = BrowserGymAction(action_str=action_str)
|
| 259 |
+
result = env.step(action)
|
| 260 |
+
|
| 261 |
+
print(f"Success: {result.reward}")
|
| 262 |
+
env.close()
|
| 263 |
+
```
|
| 264 |
+
|
| 265 |
+
## Building the Docker Image
|
| 266 |
+
|
| 267 |
+
### Prerequisites
|
| 268 |
+
|
| 269 |
+
1. **Base Image**: Build the OpenEnv base image first:
|
| 270 |
+
|
| 271 |
+
```bash
|
| 272 |
+
# From the OpenEnv repository root
|
| 273 |
+
docker build -t openenv-base:latest -f src/core/containers/images/Dockerfile .
|
| 274 |
+
```
|
| 275 |
+
|
| 276 |
+
### Build the BrowserGym Environment
|
| 277 |
+
|
| 278 |
+
```bash
|
| 279 |
+
# From the OpenEnv repository root
|
| 280 |
+
docker build -t browsergym-env:latest -f src/envs/browsergym_env/server/Dockerfile .
|
| 281 |
+
```
|
| 282 |
+
|
| 283 |
+
### Run the Server
|
| 284 |
+
|
| 285 |
+
#### For MiniWoB (Training):
|
| 286 |
+
|
| 287 |
+
```bash
|
| 288 |
+
docker run -p 8000:8000 \
|
| 289 |
+
-e BROWSERGYM_BENCHMARK="miniwob" \
|
| 290 |
+
-e BROWSERGYM_TASK_NAME="click-test" \
|
| 291 |
+
browsergym-env:latest
|
| 292 |
+
```
|
| 293 |
+
|
| 294 |
+
#### For WebArena (Evaluation):
|
| 295 |
+
|
| 296 |
+
```bash
|
| 297 |
+
docker run -p 8000:8000 \
|
| 298 |
+
-e BROWSERGYM_BENCHMARK="webarena" \
|
| 299 |
+
-e BROWSERGYM_TASK_NAME="0" \
|
| 300 |
+
-e SHOPPING="http://your-server:7770" \
|
| 301 |
+
-e SHOPPING_ADMIN="http://your-server:7780/admin" \
|
| 302 |
+
-e REDDIT="http://your-server:9999" \
|
| 303 |
+
-e GITLAB="http://your-server:8023" \
|
| 304 |
+
-e MAP="http://your-server:3000" \
|
| 305 |
+
-e WIKIPEDIA="http://your-server:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing" \
|
| 306 |
+
-e HOMEPAGE="http://your-server:4399" \
|
| 307 |
+
browsergym-env:latest
|
| 308 |
+
```
|
| 309 |
+
|
| 310 |
+
## Environment Details
|
| 311 |
+
|
| 312 |
+
### Action
|
| 313 |
+
|
| 314 |
+
Actions in BrowserGym are strings containing function-call-style commands (for example `click(...)`, `fill(...)`, `goto(...)`) that describe browser operations:
|
| 315 |
+
|
| 316 |
+
```python
|
| 317 |
+
from envs.browsergym_env import BrowserGymAction
|
| 318 |
+
|
| 319 |
+
# Click actions
|
| 320 |
+
action = BrowserGymAction(action_str="click('Submit button')")
|
| 321 |
+
action = BrowserGymAction(action_str="click('element_id_123')")
|
| 322 |
+
|
| 323 |
+
# Type actions
|
| 324 |
+
action = BrowserGymAction(action_str="fill('username', 'john@example.com')")
|
| 325 |
+
action = BrowserGymAction(action_str="fill('password', 'secret123')")
|
| 326 |
+
|
| 327 |
+
# Navigate actions
|
| 328 |
+
action = BrowserGymAction(action_str="goto('https://example.com')")
|
| 329 |
+
|
| 330 |
+
# Keyboard actions
|
| 331 |
+
action = BrowserGymAction(action_str="press('Enter')")
|
| 332 |
+
action = BrowserGymAction(action_str="press('Tab')")
|
| 333 |
+
|
| 334 |
+
# Scroll actions
|
| 335 |
+
action = BrowserGymAction(action_str="scroll('down')")
|
| 336 |
+
```
|
| 337 |
+
|
| 338 |
+
### Observation
|
| 339 |
+
|
| 340 |
+
Observations contain multiple modalities:
|
| 341 |
+
|
| 342 |
+
```python
|
| 343 |
+
result = env.step(action)
|
| 344 |
+
obs = result.observation
|
| 345 |
+
|
| 346 |
+
# Text observations
|
| 347 |
+
print(obs.text) # Primary text representation (AXTree or DOM)
|
| 348 |
+
print(obs.axtree_txt) # Accessibility tree
|
| 349 |
+
print(obs.pruned_html) # Pruned HTML (interactive elements only)
|
| 350 |
+
|
| 351 |
+
# Page metadata
|
| 352 |
+
print(obs.url) # Current URL
|
| 353 |
+
print(obs.goal) # Task goal/instruction
|
| 354 |
+
|
| 355 |
+
# Visual (if enabled)
|
| 356 |
+
if obs.screenshot is not None:
|
| 357 |
+
print(obs.screenshot.shape) # [height, width, channels]
|
| 358 |
+
|
| 359 |
+
# Error handling
|
| 360 |
+
if obs.last_action_error:
|
| 361 |
+
print(f"Action failed: {obs.error}")
|
| 362 |
+
|
| 363 |
+
# Episode status
|
| 364 |
+
print(obs.done) # True if episode ended
|
| 365 |
+
print(obs.reward) # Reward for the step
|
| 366 |
+
|
| 367 |
+
# Access full BrowserGym data (includes timestamps, etc.)
|
| 368 |
+
print(obs.metadata["browsergym_obs"]) # Full observation dict from BrowserGym
|
| 369 |
+
print(obs.metadata["browsergym_info"]) # Full info dict (timestamps, page state, etc.)
|
| 370 |
+
```
|
| 371 |
+
|
| 372 |
+
#### Advanced: Accessing Raw BrowserGym Data
|
| 373 |
+
|
| 374 |
+
For VisualWebArena or custom training, you may need additional data like timestamps or browser state. The full BrowserGym observation and info dicts are preserved in `metadata`:
|
| 375 |
+
|
| 376 |
+
```python
|
| 377 |
+
result = env.step(action)
|
| 378 |
+
|
| 379 |
+
# Access timestamps (if available)
|
| 380 |
+
info = result.observation.metadata["browsergym_info"]
|
| 381 |
+
if "timestamp" in info:
|
| 382 |
+
print(f"Action timestamp: {info['timestamp']}")
|
| 383 |
+
|
| 384 |
+
# Access additional observation fields
|
| 385 |
+
obs_dict = result.observation.metadata["browsergym_obs"]
|
| 386 |
+
if "dom_object" in obs_dict:
|
| 387 |
+
dom = obs_dict["dom_object"]
|
| 388 |
+
# Work with raw DOM object
|
| 389 |
+
|
| 390 |
+
# Access page performance data
|
| 391 |
+
if "performance" in info:
|
| 392 |
+
print(f"Page load time: {info['performance']}")
|
| 393 |
+
```
|
| 394 |
+
|
| 395 |
+
### State
|
| 396 |
+
|
| 397 |
+
The environment state tracks progress:
|
| 398 |
+
|
| 399 |
+
```python
|
| 400 |
+
state = env.state()
|
| 401 |
+
|
| 402 |
+
print(f"Benchmark: {state.benchmark}") # 'miniwob', 'webarena', etc.
|
| 403 |
+
print(f"Task: {state.task_name}") # Task name/ID
|
| 404 |
+
print(f"Episode: {state.episode_id}") # Unique episode ID
|
| 405 |
+
print(f"Steps: {state.step_count}") # Number of steps taken
|
| 406 |
+
print(f"Total Reward: {state.cum_reward}") # Cumulative reward
|
| 407 |
+
print(f"Goal: {state.goal}") # Task instruction
|
| 408 |
+
print(f"URL: {state.current_url}") # Current page URL
|
| 409 |
+
```
|
| 410 |
+
|
| 411 |
+
## Configuration
|
| 412 |
+
|
| 413 |
+
Environment variables:
|
| 414 |
+
|
| 415 |
+
### Common Settings
|
| 416 |
+
- `BROWSERGYM_BENCHMARK`: Benchmark to use (`miniwob`, `webarena`, `visualwebarena`, `workarena`)
|
| 417 |
+
- `BROWSERGYM_TASK_NAME`: Specific task name (optional, will use first available if not set)
|
| 418 |
+
- `BROWSERGYM_HEADLESS`: Run browser in headless mode (default: `true`)
|
| 419 |
+
- `BROWSERGYM_VIEWPORT_WIDTH`: Browser viewport width (default: `1280`)
|
| 420 |
+
- `BROWSERGYM_VIEWPORT_HEIGHT`: Browser viewport height (default: `720`)
|
| 421 |
+
- `BROWSERGYM_TIMEOUT`: Action timeout in milliseconds (default: `10000`)
|
| 422 |
+
|
| 423 |
+
### WebArena-Specific (only needed for WebArena benchmark)
|
| 424 |
+
- `SHOPPING`: Shopping website URL
|
| 425 |
+
- `SHOPPING_ADMIN`: Shopping admin panel URL
|
| 426 |
+
- `REDDIT`: Reddit-like forum URL
|
| 427 |
+
- `GITLAB`: GitLab instance URL
|
| 428 |
+
- `MAP`: Map service URL
|
| 429 |
+
- `WIKIPEDIA`: Wikipedia instance URL
|
| 430 |
+
- `HOMEPAGE`: Homepage URL
|
| 431 |
+
|
| 432 |
+
## Supported Benchmarks
|
| 433 |
+
|
| 434 |
+
### 1. MiniWoB++ (Training) ✅ Recommended for Training
|
| 435 |
+
|
| 436 |
+
- **100+ tasks** ranging from simple (click buttons) to complex (form filling, navigation)
|
| 437 |
+
- **Fast**: Instant resets, quick episodes
|
| 438 |
+
- **Randomized**: Task variations for generalization
|
| 439 |
+
- **No setup**: Works out-of-the-box
|
| 440 |
+
- **Dense rewards**: Immediate feedback for learning
|
| 441 |
+
|
| 442 |
+
**Use Case**: Train agents on fundamental web navigation skills
|
| 443 |
+
|
| 444 |
+
### 2. WebArena (Evaluation) 📊 Benchmark
|
| 445 |
+
|
| 446 |
+
- **812 realistic tasks** across 6 websites
|
| 447 |
+
- **Complex**: Multi-step reasoning, real web interfaces
|
| 448 |
+
- **Requires setup**: Need to run 7 backend services
|
| 449 |
+
- **Sparse rewards**: Binary success/failure
|
| 450 |
+
- **Evaluation-focused**: Test real-world performance
|
| 451 |
+
|
| 452 |
+
**Use Case**: Evaluate agents on realistic web tasks
|
| 453 |
+
|
| 454 |
+
### 3. VisualWebArena (Evaluation) 👁️ Visual Benchmark
|
| 455 |
+
|
| 456 |
+
- **910 tasks** requiring visual understanding
|
| 457 |
+
- **Multimodal**: Both text and visual observations
|
| 458 |
+
- **Requires setup**: Similar to WebArena
|
| 459 |
+
- **Challenging**: Requires visual reasoning
|
| 460 |
+
|
| 461 |
+
**Use Case**: Test visual web navigation capabilities
|
| 462 |
+
|
| 463 |
+
### 4. WorkArena (Evaluation) 💼 Enterprise Benchmark
|
| 464 |
+
|
| 465 |
+
- **Enterprise tasks**: CRM, project management, etc.
|
| 466 |
+
- **Realistic workflows**: Real enterprise software
|
| 467 |
+
- **Requires setup**: Enterprise software instances
|
| 468 |
+
|
| 469 |
+
**Use Case**: Evaluate on business automation tasks
|
| 470 |
+
|
| 471 |
+
## Typical Training Pipeline
|
| 472 |
+
|
| 473 |
+
```python
|
| 474 |
+
from envs.browsergym_env import BrowserGymEnv, BrowserGymAction
|
| 475 |
+
|
| 476 |
+
# Stage 1: Train on MiniWoB (simple tasks, fast)
|
| 477 |
+
train_env = BrowserGymEnv.from_docker_image(
|
| 478 |
+
"browsergym-env:latest",
|
| 479 |
+
environment={
|
| 480 |
+
"BROWSERGYM_BENCHMARK": "miniwob",
|
| 481 |
+
"BROWSERGYM_TASK_NAME": "click-button",
|
| 482 |
+
}
|
| 483 |
+
)
|
| 484 |
+
|
| 485 |
+
# Train your agent (RL, imitation learning, etc.)
|
| 486 |
+
agent.train(train_env, num_episodes=10000)
|
| 487 |
+
train_env.close()
|
| 488 |
+
|
| 489 |
+
# Stage 2: Evaluate on WebArena (complex tasks, realistic)
|
| 490 |
+
eval_env = BrowserGymEnv.from_docker_image(
|
| 491 |
+
"browsergym-env:latest",
|
| 492 |
+
environment={
|
| 493 |
+
"BROWSERGYM_BENCHMARK": "webarena",
|
| 494 |
+
"BROWSERGYM_TASK_NAME": "0",
|
| 495 |
+
# ... WebArena URLs
|
| 496 |
+
}
|
| 497 |
+
)
|
| 498 |
+
|
| 499 |
+
# Test performance
|
| 500 |
+
success_rate = agent.evaluate(eval_env, num_tasks=812)
|
| 501 |
+
print(f"WebArena Success Rate: {success_rate:.2%}")
|
| 502 |
+
eval_env.close()
|
| 503 |
+
```
|
| 504 |
+
|
| 505 |
+
## Development & Testing
|
| 506 |
+
|
| 507 |
+
### Running Tests
|
| 508 |
+
|
| 509 |
+
```bash
|
| 510 |
+
# From the OpenEnv repository root
|
| 511 |
+
pytest tests/envs/test_browsergym_env.py
|
| 512 |
+
```
|
| 513 |
+
|
| 514 |
+
### Local Development
|
| 515 |
+
|
| 516 |
+
```bash
|
| 517 |
+
# Install in development mode
|
| 518 |
+
cd /path/to/OpenEnv
|
| 519 |
+
pip install -e .
|
| 520 |
+
|
| 521 |
+
# Install BrowserGym
|
| 522 |
+
pip install browsergym browsergym-miniwob browsergym-webarena
|
| 523 |
+
|
| 524 |
+
# Run the server locally
|
| 525 |
+
cd src/envs/browsergym_env/server
|
| 526 |
+
export BROWSERGYM_BENCHMARK=miniwob
|
| 527 |
+
export BROWSERGYM_TASK_NAME=click-test
|
| 528 |
+
python app.py
|
| 529 |
+
```
|
| 530 |
+
|
| 531 |
+
## Project Structure
|
| 532 |
+
|
| 533 |
+
```
|
| 534 |
+
browsergym_env/
|
| 535 |
+
├── __init__.py # Module exports
|
| 536 |
+
├── models.py # Action, Observation, State dataclasses
|
| 537 |
+
├── client.py # HTTPEnvClient implementation
|
| 538 |
+
├── README.md # This file
|
| 539 |
+
└── server/
|
| 540 |
+
├── __init__.py
|
| 541 |
+
├── app.py # FastAPI application
|
| 542 |
+
├── browsergym_environment.py # Environment implementation
|
| 543 |
+
├── Dockerfile # Container specification
|
| 544 |
+
└── requirements.txt # Python dependencies
|
| 545 |
+
```
|
| 546 |
+
|
| 547 |
+
## References
|
| 548 |
+
|
| 549 |
+
- [BrowserGym GitHub](https://github.com/ServiceNow/BrowserGym)
|
| 550 |
+
- [MiniWoB++ Paper](https://arxiv.org/abs/1802.08802)
|
| 551 |
+
- [WebArena Paper](https://arxiv.org/abs/2307.13854)
|
| 552 |
+
- [WebArena Website](https://webarena.dev/)
|
| 553 |
+
- [VisualWebArena Project Page](https://jykoh.com/vwa)
|
| 554 |
+
- [OpenEnv Documentation](https://github.com/meta-pytorch/OpenEnv)
|
__init__.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""BrowserGym Environment for OpenEnv.
|
| 2 |
+
|
| 3 |
+
BrowserGym is a unified framework for web-based agent tasks that provides
|
| 4 |
+
access to multiple benchmarks under a single Gymnasium-compatible API.
|
| 5 |
+
|
| 6 |
+
Included Benchmarks:
|
| 7 |
+
- **MiniWoB++**: 100+ simple web tasks for training (no external infrastructure!)
|
| 8 |
+
- **WebArena**: 812 realistic evaluation tasks (requires backend setup)
|
| 9 |
+
- **VisualWebArena**: Visual web navigation tasks
|
| 10 |
+
- **WorkArena**: Enterprise task automation
|
| 11 |
+
|
| 12 |
+
Key Features:
|
| 13 |
+
- Unified API across all benchmarks
|
| 14 |
+
- Gymnasium-compatible interface
|
| 15 |
+
- Support for multiple observation types (text, visual, DOM)
|
| 16 |
+
- Action spaces for natural language commands
|
| 17 |
+
- Perfect for training (MiniWoB) and evaluation (WebArena)
|
| 18 |
+
|
| 19 |
+
Training Example (MiniWoB - works immediately):
|
| 20 |
+
```python
|
| 21 |
+
from envs.browsergym_env import BrowserGymEnv, BrowserGymAction
|
| 22 |
+
|
| 23 |
+
# Create training environment - no backend setup needed!
|
| 24 |
+
env = BrowserGymEnv.from_docker_image(
|
| 25 |
+
"browsergym-env:latest",
|
| 26 |
+
environment={
|
| 27 |
+
"BROWSERGYM_BENCHMARK": "miniwob",
|
| 28 |
+
"BROWSERGYM_TASK_NAME": "click-test",
|
| 29 |
+
}
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
# Train your agent
|
| 33 |
+
for episode in range(1000):
|
| 34 |
+
result = env.reset()
|
| 35 |
+
while not result.done:
|
| 36 |
+
action = agent.get_action(result.observation)
|
| 37 |
+
result = env.step(action)
|
| 38 |
+
|
| 39 |
+
env.close()
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
Evaluation Example (WebArena - requires backend):
|
| 43 |
+
```python
|
| 44 |
+
from envs.browsergym_env import BrowserGymEnv, BrowserGymAction
|
| 45 |
+
|
| 46 |
+
# Create evaluation environment
|
| 47 |
+
env = BrowserGymEnv.from_docker_image(
|
| 48 |
+
"browsergym-env:latest",
|
| 49 |
+
environment={
|
| 50 |
+
"BROWSERGYM_BENCHMARK": "webarena",
|
| 51 |
+
"BROWSERGYM_TASK_NAME": "0",
|
| 52 |
+
"SHOPPING": "http://your-server:7770",
|
| 53 |
+
# ... other backend URLs
|
| 54 |
+
}
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
# Evaluate your trained agent
|
| 58 |
+
result = env.reset()
|
| 59 |
+
# ... run evaluation
|
| 60 |
+
env.close()
|
| 61 |
+
```
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
from .client import BrowserGymEnv
|
| 65 |
+
from .models import BrowserGymAction, BrowserGymObservation, BrowserGymState
|
| 66 |
+
|
| 67 |
+
__all__ = [
|
| 68 |
+
"BrowserGymEnv",
|
| 69 |
+
"BrowserGymAction",
|
| 70 |
+
"BrowserGymObservation",
|
| 71 |
+
"BrowserGymState",
|
| 72 |
+
]
|
client.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""HTTP client for the BrowserGym environment."""
|
| 2 |
+
|
| 3 |
+
from typing import Any, Dict
|
| 4 |
+
|
| 5 |
+
from core.http_env_client import HTTPEnvClient, StepResult
|
| 6 |
+
from envs.browsergym_env.models import (
|
| 7 |
+
BrowserGymAction,
|
| 8 |
+
BrowserGymObservation,
|
| 9 |
+
BrowserGymState,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class BrowserGymEnv(HTTPEnvClient[BrowserGymAction, BrowserGymObservation]):
    """HTTP client that talks to a BrowserGym environment server.

    The server exposes several web-navigation benchmarks behind one API:
    - MiniWoB++: 100+ training tasks (no external infrastructure needed!)
    - WebArena: 812 evaluation tasks (requires backend setup)
    - VisualWebArena: Visual navigation tasks
    - WorkArena: Enterprise automation tasks

    Training example (MiniWoB - works out of the box):
    ```python
    from envs.browsergym_env import BrowserGymEnv, BrowserGymAction

    # Start a container configured for a MiniWoB task
    env = BrowserGymEnv.from_docker_image(
        "browsergym-env:latest",
        environment={
            "BROWSERGYM_BENCHMARK": "miniwob",
            "BROWSERGYM_TASK_NAME": "click-test",
        }
    )

    # Begin an episode and inspect the first observation
    result = env.reset()
    print(f"Task: {result.observation.goal}")
    print(f"Page: {result.observation.text[:200]}")

    # Send an action and read the outcome
    action = BrowserGymAction(action_str="click('Submit button')")
    result = env.step(action)
    print(f"Reward: {result.reward}")
    print(f"Done: {result.done}")

    env.close()
    ```

    Evaluation example (WebArena - requires backend):
    ```python
    from envs.browsergym_env import BrowserGymEnv, BrowserGymAction

    env = BrowserGymEnv.from_docker_image(
        "browsergym-env:latest",
        environment={
            "BROWSERGYM_BENCHMARK": "webarena",
            "BROWSERGYM_TASK_NAME": "0",  # Task 0
            # WebArena backend URLs
            "SHOPPING": "http://your-server:7770",
            "GITLAB": "http://your-server:8023",
            # ... other URLs
        }
    )

    result = env.reset()
    # ... interact with environment
    env.close()
    ```

    Benchmark identifiers:
    - miniwob: MiniWoB++ tasks (training, no setup required)
    - webarena: WebArena tasks (evaluation, requires backend)
    - visualwebarena: Visual WebArena tasks (evaluation, requires backend)
    - workarena: WorkArena tasks (evaluation, requires backend)
    """

    def _step_payload(self, action: BrowserGymAction) -> Dict[str, Any]:
        """Build the JSON body for a step request from ``action``."""
        body: Dict[str, Any] = {}
        body["action_str"] = action.action_str
        body["metadata"] = action.metadata
        return body

    def _parse_result(
        self, payload: Dict[str, Any]
    ) -> StepResult[BrowserGymObservation]:
        """Turn a reset/step response into a ``StepResult``.

        Missing fields fall back to empty strings, ``False`` or ``None``;
        ``reward`` and ``done`` are read from the top level of the payload
        and mirrored onto the observation.
        """
        fields = payload.get("observation", {})
        reward = payload.get("reward")
        done = payload.get("done", False)

        parsed = BrowserGymObservation(
            text=fields.get("text", ""),
            url=fields.get("url", ""),
            screenshot=fields.get("screenshot"),
            goal=fields.get("goal", ""),
            axtree_txt=fields.get("axtree_txt", ""),
            pruned_html=fields.get("pruned_html", ""),
            error=fields.get("error", ""),
            last_action_error=fields.get("last_action_error", False),
            done=done,
            reward=reward,
            metadata=fields.get("metadata", {}),
        )
        return StepResult(observation=parsed, reward=reward, done=done)

    def _parse_state(self, payload: Dict[str, Any]) -> BrowserGymState:
        """Turn a state response into a ``BrowserGymState``."""
        state_kwargs: Dict[str, Any] = {
            "episode_id": payload.get("episode_id"),
            "step_count": payload.get("step_count", 0),
            "benchmark": payload.get("benchmark", ""),
            "task_name": payload.get("task_name", ""),
            "task_id": payload.get("task_id"),
            "goal": payload.get("goal", ""),
            "current_url": payload.get("current_url", ""),
            "max_steps": payload.get("max_steps"),
            "cum_reward": payload.get("cum_reward", 0.0),
        }
        return BrowserGymState(**state_kwargs)
|
models.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Data models for the BrowserGym environment.
|
| 2 |
+
|
| 3 |
+
BrowserGym is a unified framework for web-based agent tasks, combining multiple
|
| 4 |
+
benchmarks including MiniWoB (training), WebArena (evaluation), VisualWebArena,
|
| 5 |
+
and more under a single Gymnasium-compatible API.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from typing import List, Optional
|
| 10 |
+
|
| 11 |
+
from core.env_server.types import Action, Observation, State
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass(kw_only=True)
class BrowserGymAction(Action):
    """A single action to run inside the BrowserGym environment.

    The action is carried as one string that BrowserGym parses into a
    browser operation, for example:
    - "click('Submit button')"
    - "fill('username', 'john@example.com')"
    - "goto('https://example.com')"
    - "scroll(down)"
    - "send_keys('Enter')"
    """

    # Action string forwarded to the environment, e.g. "click('Submit')".
    action_str: str
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@dataclass(kw_only=True)
class BrowserGymObservation(Observation):
    """What the BrowserGym environment reports after a reset or a step.

    Bundles the textual view of the page (accessibility tree or DOM), an
    optional screenshot, and page/task metadata.
    """

    # Text representation of the page (accessibility tree or DOM).
    text: str = ""

    # URL of the page currently shown.
    url: str = ""

    # Screenshot laid out as [height, width, channels]; None when visual
    # observations are not enabled.
    screenshot: Optional[List[List[List[int]]]] = None

    # Goal/instruction of the task for the current episode.
    goal: str = ""

    # Full accessibility tree rendered as text.
    axtree_txt: str = ""

    # Pruned HTML content (interactive elements only).
    pruned_html: str = ""

    # Error message when executing the action failed; empty otherwise.
    error: str = ""

    # True when the last action resulted in an error.
    last_action_error: bool = False
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@dataclass
class BrowserGymState(State):
    """Server-side bookkeeping for one BrowserGym episode.

    Records which benchmark and task are running and how far the episode
    has progressed.
    """

    # Benchmark name (e.g., 'miniwob', 'webarena', 'visualwebarena').
    benchmark: str = ""

    # Task within the benchmark (e.g., 'click-test', 'click-button').
    task_name: str = ""

    # Task ID for evaluation benchmarks (e.g., WebArena task number).
    task_id: Optional[str] = None

    # Goal/instruction of the task.
    goal: str = ""

    # URL of the active page.
    current_url: str = ""

    # Maximum number of steps allowed for this task, when known.
    max_steps: Optional[int] = None

    # Reward accumulated over the current episode.
    cum_reward: float = 0.0
|
openenv.yaml
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: browsergym_env
|
| 2 |
+
version: "0.1.0"
|
| 3 |
+
description: "BrowserGym environment for web automation tasks using Playwright"
|
| 4 |
+
action: BrowserGymAction
|
| 5 |
+
observation: BrowserGymObservation
|
pyproject.toml
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=45", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "openenv-browsergym_env"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "BrowserGym Environment for OpenEnv - Web automation using Playwright"
|
| 9 |
+
requires-python = ">=3.10"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"openenv-core>=0.1.0",
|
| 12 |
+
"fastapi>=0.104.0",
|
| 13 |
+
"uvicorn>=0.24.0",
|
| 14 |
+
"browsergym-core>=0.2.0",
|
| 15 |
+
"browsergym-miniwob>=0.2.0",
|
| 16 |
+
"browsergym-webarena>=0.2.0",
|
| 17 |
+
"gymnasium>=0.29.0",
|
| 18 |
+
"playwright>=1.40.0",
|
| 19 |
+
"Pillow>=10.0.0",
|
| 20 |
+
]
|
| 21 |
+
|
| 22 |
+
[project.optional-dependencies]
|
| 23 |
+
dev = [
|
| 24 |
+
"pytest>=8.0.0",
|
| 25 |
+
"pytest-cov>=4.0.0",
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
[tool.setuptools]
|
| 29 |
+
packages = ["browsergym_env", "browsergym_env.server"]
|
| 30 |
+
package-dir = { "browsergym_env" = ".", "browsergym_env.server" = "server" }
|
| 31 |
+
|
| 32 |
+
[tool.setuptools.package-data]
|
| 33 |
+
browsergym_env = ["**/*.yaml", "**/*.yml", "**/*.md"]
|
server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""BrowserGym environment server module."""
|
server/app.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI server for the BrowserGym environment."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
from core.env_server.http_server import create_app
|
| 6 |
+
from envs.browsergym_env.models import (
|
| 7 |
+
BrowserGymAction,
|
| 8 |
+
BrowserGymObservation,
|
| 9 |
+
)
|
| 10 |
+
from envs.browsergym_env.server.browsergym_environment import BrowserGymEnvironment
|
| 11 |
+
|
| 12 |
+
# Spellings accepted as "enabled" for boolean environment variables.
_TRUTHY_VALUES = frozenset({"1", "true", "yes", "on"})


def _env_flag(name: str, default: bool) -> bool:
    """Read a boolean environment variable.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset.

    Returns:
        True when the variable is set to one of ``1``, ``true``, ``yes`` or
        ``on`` (case-insensitive, surrounding whitespace ignored); False for
        any other value; ``default`` when the variable is not set at all.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() in _TRUTHY_VALUES


# Server configuration, read once at import time from environment variables.
benchmark = os.environ.get("BROWSERGYM_BENCHMARK", "miniwob")
task_name = os.environ.get("BROWSERGYM_TASK_NAME")  # Optional, can be None
# Previously only the literal "true" enabled headless mode, so values such as
# "1" silently requested a headed browser.
headless = _env_flag("BROWSERGYM_HEADLESS", default=True)
viewport_width = int(os.environ.get("BROWSERGYM_VIEWPORT_WIDTH", "1280"))
viewport_height = int(os.environ.get("BROWSERGYM_VIEWPORT_HEIGHT", "720"))
timeout = float(os.environ.get("BROWSERGYM_TIMEOUT", "10000"))
port = int(os.environ.get("BROWSERGYM_PORT", "8000"))

# The environment is created at import time so that `uvicorn module:app`
# finds a fully configured application object.
env = BrowserGymEnvironment(
    benchmark=benchmark,
    task_name=task_name,
    headless=headless,
    viewport_width=viewport_width,
    viewport_height=viewport_height,
    timeout=timeout,
)

# FastAPI application exposing the environment over HTTP.
app = create_app(
    env,
    BrowserGymAction,
    BrowserGymObservation,
    env_name="browsergym_env",
)

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=port)
|
server/browsergym_environment.py
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""BrowserGym Environment implementation for OpenEnv.
|
| 2 |
+
|
| 3 |
+
This module wraps the BrowserGym framework to provide a compatible interface
|
| 4 |
+
with OpenEnv's Environment ABC. BrowserGym includes multiple benchmarks:
|
| 5 |
+
- MiniWoB++: Training environment with 100+ simple web tasks
|
| 6 |
+
- WebArena: Realistic evaluation with 812 complex tasks
|
| 7 |
+
- VisualWebArena: Visual web navigation tasks
|
| 8 |
+
- WorkArena: Enterprise task automation
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import importlib
|
| 12 |
+
import os
|
| 13 |
+
from typing import Any, Dict, Optional
|
| 14 |
+
from uuid import uuid4
|
| 15 |
+
|
| 16 |
+
import gymnasium as gym
|
| 17 |
+
|
| 18 |
+
from core.env_server.interfaces import Environment
|
| 19 |
+
from envs.browsergym_env.models import (
|
| 20 |
+
BrowserGymAction,
|
| 21 |
+
BrowserGymObservation,
|
| 22 |
+
BrowserGymState,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
_MINIWOB_LOAD_HELP = (
|
| 27 |
+
"MiniWoB tasks require the MiniWoB HTML bundle to be served over HTTP. "
|
| 28 |
+
"The official BrowserGym Docker image handles this automatically by "
|
| 29 |
+
"serving the bundle on port 8888. For custom or non-Docker deployments, "
|
| 30 |
+
"clone the MiniWoB++ repository, start a static server inside "
|
| 31 |
+
"`miniwob-plusplus/miniwob/html` (e.g. `python -m http.server 8888`), and "
|
| 32 |
+
"set the MINIWOB_URL environment variable to the served base URL such as "
|
| 33 |
+
"`http://localhost:8888/miniwob/`."
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class BrowserGymEnvironment(Environment):
    """OpenEnv ``Environment`` backed by a BrowserGym Gymnasium environment.

    The wrapped environment is created once in ``__init__`` through
    ``gym.make`` and reused for every episode. ``reset`` and ``step`` convert
    its observations into ``BrowserGymObservation`` objects and keep a
    ``BrowserGymState`` in sync.
    """

    # Observation keys tried, in order, when deriving the text view of a page.
    _TEXT_KEYS = ("axtree_txt", "pruned_html", "dom_txt")

    def __init__(
        self,
        benchmark: str = "miniwob",
        task_name: Optional[str] = None,
        headless: bool = True,
        viewport_width: int = 1280,
        viewport_height: int = 720,
        timeout: float = 10000.0,
        **gym_kwargs: Any,
    ):
        """Import the benchmark package and create the Gymnasium environment.

        Args:
            benchmark: Benchmark to use ('miniwob', 'webarena', 'visualwebarena', etc.)
            task_name: Specific task within the benchmark (e.g., 'click-test').
                When omitted, the environment id is just ``browsergym/<benchmark>``.
            headless: Whether to run the browser in headless mode
            viewport_width: Browser viewport width
            viewport_height: Browser viewport height
            timeout: Action timeout in milliseconds
            **gym_kwargs: Additional arguments passed to gym.make()

        Raises:
            ValueError: If the benchmark module cannot be imported or
                ``gym.make`` fails for the resulting environment id.
        """
        super().__init__()

        self.benchmark = benchmark
        self.task_name = task_name
        self.headless = headless
        self.viewport_width = viewport_width
        self.viewport_height = viewport_height
        self.timeout = timeout
        self.gym_kwargs = dict(gym_kwargs)

        # Build environment ID
        if task_name:
            self.env_id = f"browsergym/{benchmark}.{task_name}"
        else:
            self.env_id = f"browsergym/{benchmark}"

        # Importing the benchmark module is forced here, before gym.make is
        # asked for the environment id.
        benchmark_modules = {
            "miniwob": "browsergym.miniwob",
            "webarena": "browsergym.webarena",
            "visualwebarena": "browsergym.visualwebarena",
            "workarena": "browsergym.workarena",
        }
        module_path = benchmark_modules.get(benchmark)
        try:
            if module_path:
                importlib.import_module(module_path)
            else:
                importlib.import_module("browsergym")
        except ModuleNotFoundError as import_error:
            message = (
                "Failed to import BrowserGym benchmark "
                f"'{benchmark}': {import_error}\n"
                "Install the matching browsergym package "
                f"(e.g., browsergym-{benchmark})."
            )
            raise ValueError(message) from import_error

        # Create the BrowserGym environment
        try:
            self.gym_env = gym.make(
                self.env_id,
                headless=headless,
                viewport={"width": viewport_width, "height": viewport_height},
                timeout=timeout,
                **self.gym_kwargs,
            )
        except Exception as e:  # noqa: BLE001 - gym.make
            message = (
                "Failed to create BrowserGym environment "
                f"'{self.env_id}': {e}\n"
                "Make sure the benchmark package is installed "
                f"(e.g., pip install browsergym-{benchmark})."
            )
            raise ValueError(message) from e

        # State tracking
        self._state = BrowserGymState(
            episode_id=str(uuid4()),
            step_count=0,
            benchmark=benchmark,
            task_name=task_name or "",
        )

        self._last_obs: Optional[Dict[str, Any]] = None
        self._last_info: Optional[Dict[str, Any]] = None

    def reset(
        self,
        seed: Optional[int] = None,
        task_name: Optional[str] = None,
    ) -> BrowserGymObservation:
        """Start a new episode.

        Args:
            seed: Random seed forwarded to the Gymnasium ``reset`` call.
            task_name: Task name recorded in the state for this episode.
                NOTE(review): this only changes the reported state; the
                Gymnasium environment created in ``__init__`` is not rebuilt.

        Returns:
            Initial observation for the task

        Raises:
            ValueError: For MiniWoB, when the reset fails with a
                "core is not defined" error (the HTML bundle is not served).
        """
        # Fresh state (and episode id) for every episode.
        self._state = BrowserGymState(
            episode_id=str(uuid4()),
            step_count=0,
            benchmark=self.benchmark,
            task_name=task_name or self.task_name or "",
        )

        reset_options: Dict[str, Any] = {}
        if seed is not None:
            reset_options["seed"] = seed

        try:
            obs, info = self._reset_gym_env(reset_options)
        except Exception as err:  # noqa: BLE001 - browsergym
            # Covers the retry as well, so a failing second attempt still
            # yields the actionable MiniWoB message.
            if self.benchmark == "miniwob" and "core is not defined" in str(err):
                raise ValueError(_MINIWOB_LOAD_HELP) from err
            raise

        self._last_obs = obs
        self._last_info = info

        return self._create_observation(obs, info, done=False, reward=0.0)

    def _reset_gym_env(self, reset_options: Dict[str, Any]) -> Any:
        """Reset the wrapped environment, retrying once after a stale-context error."""
        try:
            return self.gym_env.reset(**reset_options)
        except AttributeError as err:
            if "context" in str(err) and hasattr(self.gym_env, "close"):
                # BrowserGym can leave partially initialized state after a
                # failed reset. Close the hanging resources and try once more.
                self.gym_env.close()
                return self.gym_env.reset(**reset_options)
            raise

    def step(self, action: BrowserGymAction) -> BrowserGymObservation:
        """Execute an action in the environment.

        Any exception raised while stepping or converting the result is
        reported through the returned observation (``last_action_error`` set,
        ``error`` holding the message) instead of being propagated.

        Args:
            action: The action to execute

        Returns:
            Observation after executing the action
        """
        self._state.step_count += 1

        try:
            obs, reward, terminated, truncated, info = self.gym_env.step(
                action.action_str
            )

            self._last_obs = obs
            self._last_info = info

            done = terminated or truncated
            self._state.cum_reward += float(reward)

            # _create_observation also refreshes the goal and URL in the state.
            return self._create_observation(
                obs, info, done=done, reward=float(reward)
            )

        except Exception as e:  # noqa: BLE001 - reported via the observation
            return self._error_observation(str(e))

    def _error_observation(self, error_msg: str) -> BrowserGymObservation:
        """Build the observation returned when a step fails.

        Page text and URL come from the last successful observation; the
        lookup tolerates a missing or non-dict last observation so the error
        path cannot raise itself.
        """
        last_obs = self._last_obs if isinstance(self._last_obs, dict) else {}
        # "text" is honoured first for backward compatibility, then the same
        # keys used for regular observations.
        text = last_obs.get("text") or self._extract_text(self._last_obs)
        url = last_obs.get("url") or self._state.current_url

        return BrowserGymObservation(
            text=text,
            url=url,
            goal=self._state.goal,
            error=error_msg,
            last_action_error=True,
            done=False,
            reward=0.0,
        )

    @classmethod
    def _extract_text(cls, obs: Any) -> str:
        """Return the text view of ``obs``.

        A string observation is returned as-is; for a dict the first key of
        ``_TEXT_KEYS`` that is present wins; anything else yields "".
        """
        # The type is checked before any membership test so that a string
        # observation is never substring-matched or indexed by key.
        if isinstance(obs, str):
            return obs
        if isinstance(obs, dict):
            for key in cls._TEXT_KEYS:
                if key in obs:
                    return obs[key]
        return ""

    def _create_observation(
        self,
        obs: Dict[str, Any],
        info: Dict[str, Any],
        done: bool,
        reward: float,
    ) -> BrowserGymObservation:
        """Convert BrowserGym observation to OpenEnv format.

        Also updates ``current_url`` and ``goal`` on the tracked state.

        Args:
            obs: BrowserGym observation dict
            info: BrowserGym info dict
            done: Whether episode is done
            reward: Reward for the step

        Returns:
            BrowserGymObservation
        """
        obs_dict: Dict[str, Any] = obs if isinstance(obs, dict) else {}

        text = self._extract_text(obs)

        # URL: info first, then info["page"], then the observation itself.
        url = info.get("url", "")
        if not url and "page" in info:
            url = info["page"].get("url", "")
        if not url:
            url = obs_dict.get("url") or ""

        # Goal: info first, then info["task"], then the observation; when none
        # of them carries it, keep the goal already known for this episode
        # rather than overwriting it with an empty string.
        goal = info.get("goal", "")
        if not goal and "task" in info:
            goal = info["task"].get("goal", "")
        if not goal:
            goal = obs_dict.get("goal") or self._state.goal

        self._state.current_url = url
        self._state.goal = goal

        # The full BrowserGym observation and info are preserved in metadata
        # so that fields not modelled explicitly remain available.
        # NOTE(review): the screenshot is passed through unchanged; confirm
        # the server's serializer accepts whatever type BrowserGym returns.
        browsergym_metadata = {
            "browsergym_obs": obs_dict,
            "browsergym_info": info,
        }

        return BrowserGymObservation(
            text=text,
            url=url,
            screenshot=obs_dict.get("screenshot"),
            goal=goal,
            axtree_txt=obs_dict.get("axtree_txt", ""),
            pruned_html=obs_dict.get("pruned_html", ""),
            error="",
            last_action_error=False,
            done=done,
            reward=reward,
            metadata=browsergym_metadata,
        )

    @property
    def state(self) -> BrowserGymState:
        """Get the current environment state."""
        return self._state

    def close(self) -> None:
        """Close the wrapped Gymnasium environment, if it was created."""
        # gym_env is missing when __init__ failed before gym.make succeeded.
        if hasattr(self, "gym_env"):
            self.gym_env.close()
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
browsergym>=0.2.0
|
| 2 |
+
browsergym-core>=0.2.0
|
| 3 |
+
browsergym-miniwob>=0.2.0
|
| 4 |
+
browsergym-webarena>=0.2.0
|
| 5 |
+
gymnasium>=0.29.0
|
| 6 |
+
playwright>=1.40.0
|
| 7 |
+
Pillow>=10.0.0
|
| 8 |
+
fastapi>=0.104.0
|
| 9 |
+
uvicorn>=0.24.0
|
server/start.sh
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Entrypoint: serve the MiniWoB HTML bundle over HTTP, then run the
# BrowserGym API server, stopping both when either the API server exits or
# the script receives INT/TERM.
set -euo pipefail

MINIWOB_HTML_DIR=${MINIWOB_HTML_DIR:-/app/miniwob-plusplus/miniwob/html}
MINIWOB_HTTP_PORT=${MINIWOB_HTTP_PORT:-8888}
BROWSERGYM_PORT=${BROWSERGYM_PORT:-8000}

if [ ! -d "${MINIWOB_HTML_DIR}" ]; then
    echo "MiniWoB HTML directory not found at ${MINIWOB_HTML_DIR}" >&2
    exit 1
fi

HTTP_SERVER_PID=""
APP_PID=""

# Best-effort shutdown of both children; either may already be gone.
cleanup() {
    if [ -n "${APP_PID}" ]; then
        kill "${APP_PID}" 2>/dev/null || true
        # Give the API server the chance to finish its own shutdown.
        wait "${APP_PID}" 2>/dev/null || true
    fi
    if [ -n "${HTTP_SERVER_PID}" ]; then
        kill "${HTTP_SERVER_PID}" 2>/dev/null || true
    fi
}

# The trap is installed before anything is started, and the API server is
# NOT launched with `exec`: exec replaces this shell, which would discard
# the trap and leave the static server running.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

python -m http.server "${MINIWOB_HTTP_PORT}" --bind 0.0.0.0 --directory "${MINIWOB_HTML_DIR}" &
HTTP_SERVER_PID=$!

# Give the static server a moment, then verify it is still alive.
sleep 1
if ! kill -0 "${HTTP_SERVER_PID}" 2>/dev/null; then
    echo "Failed to start MiniWoB static server on port ${MINIWOB_HTTP_PORT}" >&2
    exit 1
fi

python -m uvicorn envs.browsergym_env.server.app:app --host 0.0.0.0 --port "${BROWSERGYM_PORT}" &
APP_PID=$!

# Propagate the API server's exit status; the EXIT trap then cleans up.
wait "${APP_PID}"
|
| 29 |
+
|