Spaces:
Sleeping
Sleeping
Second Commit
Browse files- Dockerfile +19 -0
- README.md +18 -10
- env/__init__.py +1 -0
- env/environment.py +66 -0
- env/graders.py +106 -0
- env/models.py +32 -0
- env/reward.py +2 -0
- env/tasks.py +58 -0
- inference.py +133 -0
- openenv.yaml +8 -0
- requirements.txt +4 -0
- server/app.py +36 -0
Dockerfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
# Create a non-root user (important for HF Spaces)
|
| 4 |
+
RUN useradd -m -u 1000 user
|
| 5 |
+
|
| 6 |
+
USER user
|
| 7 |
+
ENV HOME=/home/user \
|
| 8 |
+
PATH=/home/user/.local/bin:$PATH
|
| 9 |
+
|
| 10 |
+
WORKDIR $HOME/app
|
| 11 |
+
|
| 12 |
+
COPY --chown=user requirements.txt .
|
| 13 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 14 |
+
|
| 15 |
+
COPY --chown=user . .
|
| 16 |
+
|
| 17 |
+
EXPOSE 7860
|
| 18 |
+
|
| 19 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,10 +1,18 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Email Triage OpenEnv
|
| 2 |
+
|
| 3 |
+
A real-world OpenEnv simulator that tests an agent's ability to efficiently manage an inbox through replying, forwarding, archiving, and identifying spam with constraints and SLA tracking.
|
| 4 |
+
|
| 5 |
+
## Structure
|
| 6 |
+
- `env/`: Environment logic and pydantic models
|
| 7 |
+
- `server/`: FastAPI server wrapper
|
| 8 |
+
- `inference.py`: Standard inference script connecting to OpenAI LLMs.
|
| 9 |
+
|
| 10 |
+
## Deployment (Hugging Face Spaces Compatible)
|
| 11 |
+
Run with Docker:
|
| 12 |
+
`docker build -t email-env .`
|
| 13 |
+
`docker run -p 7860:7860 email-env`
|
| 14 |
+
|
| 15 |
+
Validate:
|
| 16 |
+
`openenv validate`
|
| 17 |
+
|
| 18 |
+
Start testing APIs on 0.0.0.0:7860.
|
env/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Email Triage Environment
|
env/environment.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, Any, Tuple
|
| 2 |
+
from .models import State, Action, Observation
|
| 3 |
+
from .tasks import get_initial_state
|
| 4 |
+
from .graders import grade_action
|
| 5 |
+
|
| 6 |
+
class EmailTriageEnv:
|
| 7 |
+
def __init__(self):
|
| 8 |
+
self._state: State = None
|
| 9 |
+
self.current_task: str = "easy"
|
| 10 |
+
|
| 11 |
+
async def reset(self, task: str = "easy") -> Tuple[Observation, Dict[str, Any]]:
|
| 12 |
+
self.current_task = task
|
| 13 |
+
self._state = get_initial_state(task)
|
| 14 |
+
return self._state.observation, {}
|
| 15 |
+
|
| 16 |
+
async def step(self, action_dict: dict) -> Tuple[Observation, float, bool, Dict[str, Any]]:
|
| 17 |
+
if self._state is None or self._state.is_done:
|
| 18 |
+
obs = self._state.observation if self._state else None
|
| 19 |
+
return obs, 0.0, True, {"error": "Environment must be reset before stepping"}
|
| 20 |
+
|
| 21 |
+
try:
|
| 22 |
+
action = Action(**action_dict)
|
| 23 |
+
except Exception as e:
|
| 24 |
+
self._state.step_count += 1
|
| 25 |
+
if self._state.step_count >= self._state.max_steps:
|
| 26 |
+
self._state.is_done = True
|
| 27 |
+
return self._state.observation, 0.0, self._state.is_done, {"error": f"Invalid action format: {str(e)}"}
|
| 28 |
+
|
| 29 |
+
self._state.step_count += 1
|
| 30 |
+
|
| 31 |
+
email_to_process = None
|
| 32 |
+
for i, email in enumerate(self._state.observation.inbox):
|
| 33 |
+
if email.id == action.email_id:
|
| 34 |
+
email_to_process = self._state.observation.inbox.pop(i)
|
| 35 |
+
break
|
| 36 |
+
|
| 37 |
+
if not email_to_process:
|
| 38 |
+
self._state.is_done = len(self._state.observation.inbox) == 0 or self._state.step_count >= self._state.max_steps
|
| 39 |
+
return self._state.observation, 0.0, self._state.is_done, {"error": "Email ID not found in inbox"}
|
| 40 |
+
|
| 41 |
+
reward = grade_action(self.current_task, action, email_to_process, self._state)
|
| 42 |
+
reward = max(0.0, min(1.0, reward))
|
| 43 |
+
self._state.score = max(0.0, min(1.0, self._state.score + reward))
|
| 44 |
+
|
| 45 |
+
if action.action_type == "reply":
|
| 46 |
+
self._state.observation.replied.append(email_to_process)
|
| 47 |
+
elif action.action_type == "forward":
|
| 48 |
+
self._state.observation.forwarded.append(email_to_process)
|
| 49 |
+
elif action.action_type == "archive":
|
| 50 |
+
self._state.observation.archived.append(email_to_process)
|
| 51 |
+
elif action.action_type == "mark_spam":
|
| 52 |
+
self._state.observation.spam.append(email_to_process)
|
| 53 |
+
elif action.action_type == "request_info":
|
| 54 |
+
self._state.observation.pending_info.append(email_to_process)
|
| 55 |
+
elif action.action_type == "escalate":
|
| 56 |
+
self._state.observation.escalated.append(email_to_process)
|
| 57 |
+
|
| 58 |
+
if len(self._state.observation.inbox) == 0 or self._state.step_count >= self._state.max_steps:
|
| 59 |
+
self._state.is_done = True
|
| 60 |
+
|
| 61 |
+
return self._state.observation, reward, self._state.is_done, {}
|
| 62 |
+
|
| 63 |
+
def state(self) -> State:
|
| 64 |
+
if self._state is None:
|
| 65 |
+
self._state = get_initial_state("easy")
|
| 66 |
+
return self._state
|
env/graders.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .models import Action, Email, State
|
| 2 |
+
from .reward import compute_dense_reward
|
| 3 |
+
|
| 4 |
+
def grade_easy(action: Action, email: Email) -> float:
|
| 5 |
+
if email.id == "e1":
|
| 6 |
+
if action.action_type == "reply":
|
| 7 |
+
return compute_dense_reward(True)
|
| 8 |
+
return compute_dense_reward(False)
|
| 9 |
+
if email.id == "e2":
|
| 10 |
+
if action.action_type == "mark_spam":
|
| 11 |
+
return compute_dense_reward(True)
|
| 12 |
+
return compute_dense_reward(False)
|
| 13 |
+
return compute_dense_reward(False)
|
| 14 |
+
|
| 15 |
+
def grade_medium(action: Action, email: Email) -> float:
|
| 16 |
+
if email.id == "m1":
|
| 17 |
+
# Missing order ID logic -> should request info
|
| 18 |
+
if action.action_type == "request_info":
|
| 19 |
+
return compute_dense_reward(True)
|
| 20 |
+
elif action.action_type == "reply" and action.response_text:
|
| 21 |
+
if "order" in action.response_text.lower() and "id" in action.response_text.lower():
|
| 22 |
+
return compute_dense_reward(True)
|
| 23 |
+
return compute_dense_reward(True) * 0.5
|
| 24 |
+
elif action.action_type == "forward" and action.forward_to and "support" in action.forward_to.lower():
|
| 25 |
+
# Escalating an incomplete ticket rather than asking for info gets partial score
|
| 26 |
+
return compute_dense_reward(True) * 0.3
|
| 27 |
+
return compute_dense_reward(False)
|
| 28 |
+
elif email.id == "m2":
|
| 29 |
+
if action.action_type in ["mark_spam", "archive"]:
|
| 30 |
+
return compute_dense_reward(True)
|
| 31 |
+
return compute_dense_reward(False)
|
| 32 |
+
elif email.id == "m3":
|
| 33 |
+
if action.action_type == "reply" and action.response_text and len(action.response_text) > 5:
|
| 34 |
+
return compute_dense_reward(True)
|
| 35 |
+
elif action.action_type == "reply":
|
| 36 |
+
return compute_dense_reward(True) * 0.5
|
| 37 |
+
return compute_dense_reward(False)
|
| 38 |
+
return compute_dense_reward(False)
|
| 39 |
+
|
| 40 |
+
def grade_hard(action: Action, email: Email) -> float:
|
| 41 |
+
if email.id == "h1": # VIP Issue
|
| 42 |
+
# We need reply containing "investigating" or "downtime"
|
| 43 |
+
base = 0.0
|
| 44 |
+
if action.action_type == "reply":
|
| 45 |
+
if action.response_text and "investigat" in action.response_text.lower():
|
| 46 |
+
base = 1.0
|
| 47 |
+
else:
|
| 48 |
+
base = 0.5
|
| 49 |
+
elif action.action_type == "escalate":
|
| 50 |
+
if action.priority_level == "urgent":
|
| 51 |
+
base = 1.0
|
| 52 |
+
else:
|
| 53 |
+
base = 0.7
|
| 54 |
+
return compute_dense_reward(True) * base
|
| 55 |
+
|
| 56 |
+
elif email.id == "h2": # Alert
|
| 57 |
+
if action.action_type == "escalate":
|
| 58 |
+
if action.priority_level == "urgent":
|
| 59 |
+
return compute_dense_reward(True)
|
| 60 |
+
return compute_dense_reward(True) * 0.7
|
| 61 |
+
elif action.action_type == "forward" and action.forward_to and "eng" in action.forward_to.lower():
|
| 62 |
+
if action.priority_level == "urgent":
|
| 63 |
+
return compute_dense_reward(True)
|
| 64 |
+
return compute_dense_reward(True) * 0.5
|
| 65 |
+
return compute_dense_reward(False)
|
| 66 |
+
|
| 67 |
+
elif email.id == "h3": # Compliance
|
| 68 |
+
if action.action_type == "reply":
|
| 69 |
+
if action.response_text and "acknowledge" in action.response_text.lower():
|
| 70 |
+
return compute_dense_reward(True)
|
| 71 |
+
return compute_dense_reward(True) * 0.4
|
| 72 |
+
elif action.action_type == "forward" and action.forward_to and "legal" in action.forward_to.lower():
|
| 73 |
+
return compute_dense_reward(True)
|
| 74 |
+
return compute_dense_reward(False)
|
| 75 |
+
|
| 76 |
+
elif email.id == "h4": # CEO Phish Attack
|
| 77 |
+
if action.action_type == "mark_spam":
|
| 78 |
+
return compute_dense_reward(True)
|
| 79 |
+
elif action.action_type == "archive" or action.action_type == "escalate":
|
| 80 |
+
return compute_dense_reward(True) * 0.5
|
| 81 |
+
return compute_dense_reward(False)
|
| 82 |
+
|
| 83 |
+
elif email.id == "h5": # Intern project
|
| 84 |
+
if action.action_type == "reply":
|
| 85 |
+
return compute_dense_reward(True)
|
| 86 |
+
return compute_dense_reward(False)
|
| 87 |
+
|
| 88 |
+
return compute_dense_reward(False)
|
| 89 |
+
|
| 90 |
+
def grade_action(task_level: str, action: Action, email: Email, state: State) -> float:
|
| 91 |
+
tasks_sizes = {"easy": 2, "medium": 3, "hard": 5}
|
| 92 |
+
size = tasks_sizes.get(task_level, 1)
|
| 93 |
+
|
| 94 |
+
if task_level == "easy":
|
| 95 |
+
r = grade_easy(action, email)
|
| 96 |
+
elif task_level == "medium":
|
| 97 |
+
r = grade_medium(action, email)
|
| 98 |
+
elif task_level == "hard":
|
| 99 |
+
r = grade_hard(action, email)
|
| 100 |
+
else:
|
| 101 |
+
r = 0.0
|
| 102 |
+
|
| 103 |
+
# Strictly bound reward between 0.0 and 1.0 for this step
|
| 104 |
+
# Max episode score stays bounded since pop removes 1 email, at max 1 for each email / sizes = 1.0 total max
|
| 105 |
+
scaled_reward = max(0.0, min(1.0, r / size))
|
| 106 |
+
return scaled_reward
|
env/models.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
from typing import List, Optional, Dict, Any
|
| 3 |
+
|
| 4 |
+
class Email(BaseModel):
|
| 5 |
+
id: str
|
| 6 |
+
sender: str
|
| 7 |
+
subject: str
|
| 8 |
+
body: str
|
| 9 |
+
metadata: Dict[str, Any] = Field(default_factory=dict, description="Metadata like SLA, tags, or threat-level.")
|
| 10 |
+
|
| 11 |
+
class Observation(BaseModel):
|
| 12 |
+
inbox: List[Email]
|
| 13 |
+
archived: List[Email]
|
| 14 |
+
replied: List[Email]
|
| 15 |
+
forwarded: List[Email]
|
| 16 |
+
spam: List[Email]
|
| 17 |
+
escalated: List[Email]
|
| 18 |
+
pending_info: List[Email]
|
| 19 |
+
|
| 20 |
+
class Action(BaseModel):
|
| 21 |
+
action_type: str = Field(..., description="'reply', 'forward', 'archive', 'mark_spam', 'request_info', or 'escalate'")
|
| 22 |
+
email_id: str
|
| 23 |
+
response_text: Optional[str] = None
|
| 24 |
+
forward_to: Optional[str] = None
|
| 25 |
+
priority_level: str = Field(default="normal", description="'urgent', 'normal', or 'low'")
|
| 26 |
+
|
| 27 |
+
class State(BaseModel):
|
| 28 |
+
step_count: int
|
| 29 |
+
max_steps: int
|
| 30 |
+
score: float
|
| 31 |
+
is_done: bool
|
| 32 |
+
observation: Observation
|
env/reward.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def compute_dense_reward(correct_action: bool) -> float:
|
| 2 |
+
return 1.0 if correct_action else 0.0
|
env/tasks.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
|
| 2 |
+
from typing import List
|
| 3 |
+
from .models import Email, Observation, State
|
| 4 |
+
|
| 5 |
+
class TaskConfig(BaseModel):
|
| 6 |
+
difficulty: str
|
| 7 |
+
initial_inbox: List[Email]
|
| 8 |
+
max_steps: int
|
| 9 |
+
|
| 10 |
+
TASKS = {
|
| 11 |
+
"easy": TaskConfig(
|
| 12 |
+
difficulty="easy",
|
| 13 |
+
initial_inbox=[
|
| 14 |
+
Email(id="e1", sender="boss@company.com", subject="Meeting", body="Are we still on for 3 PM?", metadata={"SLA": "24h"}),
|
| 15 |
+
Email(id="e2", sender="spam@deals.com", subject="Buy now!", body="Get 50% off pills.", metadata={"spam_score": 0.99})
|
| 16 |
+
],
|
| 17 |
+
max_steps=5
|
| 18 |
+
),
|
| 19 |
+
"medium": TaskConfig(
|
| 20 |
+
difficulty="medium",
|
| 21 |
+
initial_inbox=[
|
| 22 |
+
# Missing order ID - needs request_info
|
| 23 |
+
Email(id="m1", sender="customer@help.com", subject="Broken item", body="My order arrived broken. I need a refund immediately.", metadata={"customer_tier": "standard"}),
|
| 24 |
+
Email(id="m2", sender="marketing@agency.com", subject="SEO services", body="We can skyrocket your traffic.", metadata={"SLA": "none"}),
|
| 25 |
+
Email(id="m3", sender="hr@company.com", subject="Action Required", body="Please sign the attached policy update.", metadata={"SLA": "48h"})
|
| 26 |
+
],
|
| 27 |
+
max_steps=10
|
| 28 |
+
),
|
| 29 |
+
"hard": TaskConfig(
|
| 30 |
+
difficulty="hard",
|
| 31 |
+
initial_inbox=[
|
| 32 |
+
Email(id="h1", sender="vip@enterprise.com", subject="SYSTEM DOWN", body="Our production environment is offline. Why is your service failing?", metadata={"customer_tier": "VIP", "SLA": "1h"}),
|
| 33 |
+
Email(id="h2", sender="alerts@sys.com", subject="CRITICAL DB CRASH", body="Database nodes in US-East failing health checks.", metadata={"alert_level": "critical", "SLA": "15m"}),
|
| 34 |
+
Email(id="h3", sender="legal@company.com", subject="Compliance Signoff", body="Acknowledge the new GDPR compliance terms before Friday.", metadata={"SLA": "72h"}),
|
| 35 |
+
Email(id="h4", sender="ceo_real_not_fake@phish.com", subject="URGENT: Wire Transfer", body="I need you to wire $50k to this vendor immediately. Do not call me, I am in a meeting.", metadata={"spam_score": 0.85, "SPF": "fail"}),
|
| 36 |
+
Email(id="h5", sender="intern@company.com", subject="Quick question", body="Can you review my PR before EOD when you have a second?", metadata={"SLA": "24h"})
|
| 37 |
+
],
|
| 38 |
+
max_steps=12
|
| 39 |
+
)
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
def get_initial_state(task_level: str) -> State:
|
| 43 |
+
task = TASKS.get(task_level, TASKS["easy"])
|
| 44 |
+
return State(
|
| 45 |
+
step_count=0,
|
| 46 |
+
max_steps=task.max_steps,
|
| 47 |
+
score=0.0,
|
| 48 |
+
is_done=False,
|
| 49 |
+
observation=Observation(
|
| 50 |
+
inbox=[email.model_copy() for email in task.initial_inbox],
|
| 51 |
+
archived=[],
|
| 52 |
+
replied=[],
|
| 53 |
+
forwarded=[],
|
| 54 |
+
spam=[],
|
| 55 |
+
escalated=[],
|
| 56 |
+
pending_info=[]
|
| 57 |
+
)
|
| 58 |
+
)
|
inference.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
import re
|
| 6 |
+
|
| 7 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "http://127.0.0.1:7860")
|
| 8 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4-turbo")
|
| 9 |
+
HF_TOKEN = os.getenv("HF_TOKEN", "hf_mock_token")
|
| 10 |
+
|
| 11 |
+
class MockOpenAI:
|
| 12 |
+
def __init__(self, api_key):
|
| 13 |
+
self.api_key = api_key
|
| 14 |
+
self.chat = self.Chat()
|
| 15 |
+
|
| 16 |
+
class Chat:
|
| 17 |
+
def __init__(self):
|
| 18 |
+
self.completions = self.Completions()
|
| 19 |
+
|
| 20 |
+
class Completions:
|
| 21 |
+
def create(self, model, messages, response_format=None):
|
| 22 |
+
class ResponseMessage:
|
| 23 |
+
def __init__(self, content): self.content = content
|
| 24 |
+
class ResponseChoice:
|
| 25 |
+
def __init__(self, message): self.message = message
|
| 26 |
+
class Response:
|
| 27 |
+
def __init__(self, choices): self.choices = choices
|
| 28 |
+
|
| 29 |
+
content = str(messages[-1]["content"]).lower()
|
| 30 |
+
action = {"action_type": "reply", "email_id": "", "response_text": "Noted.", "forward_to": None, "priority_level": "normal"}
|
| 31 |
+
|
| 32 |
+
match = re.search(r"'id': '([^']+)'", str(messages))
|
| 33 |
+
if match: action["email_id"] = match.group(1)
|
| 34 |
+
|
| 35 |
+
if "spam" in content or "wire" in content or "password" in content or "phish" in content or "traffic" in content:
|
| 36 |
+
action["action_type"] = "mark_spam"
|
| 37 |
+
elif "broken" in content and "order" in content and "id" not in content:
|
| 38 |
+
action["action_type"] = "request_info"
|
| 39 |
+
action["response_text"] = "Please provide your order ID."
|
| 40 |
+
elif "downtime" in content or "offline" in content or "fail" in content:
|
| 41 |
+
if "vip" in content:
|
| 42 |
+
action["action_type"] = "reply"
|
| 43 |
+
action["priority_level"] = "urgent"
|
| 44 |
+
action["response_text"] = "We are currently investigating the downtime."
|
| 45 |
+
else:
|
| 46 |
+
action["action_type"] = "escalate"
|
| 47 |
+
action["priority_level"] = "urgent"
|
| 48 |
+
elif "compliance" in content:
|
| 49 |
+
action["action_type"] = "reply"
|
| 50 |
+
action["response_text"] = "I acknowledge the terms."
|
| 51 |
+
|
| 52 |
+
return Response([ResponseChoice(ResponseMessage(json.dumps(action)))])
|
| 53 |
+
|
| 54 |
+
try:
|
| 55 |
+
from openai import OpenAI
|
| 56 |
+
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", "default-key"), base_url=os.getenv("OPENAI_BASE_URL", None))
|
| 57 |
+
except ImportError:
|
| 58 |
+
client = MockOpenAI(api_key="mock")
|
| 59 |
+
|
| 60 |
+
def get_action_from_llm(state_observation):
|
| 61 |
+
system_prompt = """You are an elite email triage AI agent. Make fast, precise triage decisions based on email bodies, subjects, and metadata schemas like SLA or threat-level properties.
|
| 62 |
+
You must choose one of these actions: reply, forward, archive, mark_spam, request_info, escalate.
|
| 63 |
+
If you respond, ensure response_text handles context constraints professionally.
|
| 64 |
+
If you deal with critical downtime metadata, utilize escalate with urgent priority_level.
|
| 65 |
+
Return ONLY valid JSON: {"action_type": "...", "email_id": "...", "forward_to": "...", "response_text": "...", "priority_level": "..."}"""
|
| 66 |
+
|
| 67 |
+
messages = [
|
| 68 |
+
{"role": "system", "content": system_prompt},
|
| 69 |
+
{"role": "user", "content": f"Current Inbox Observation: {json.dumps(state_observation)}"}
|
| 70 |
+
]
|
| 71 |
+
|
| 72 |
+
response = client.chat.completions.create(
|
| 73 |
+
model=MODEL_NAME,
|
| 74 |
+
messages=messages,
|
| 75 |
+
response_format={"type": "json_object"}
|
| 76 |
+
)
|
| 77 |
+
try:
|
| 78 |
+
data = json.loads(response.choices[0].message.content)
|
| 79 |
+
return data
|
| 80 |
+
except:
|
| 81 |
+
inbox = state_observation.get("inbox", [])
|
| 82 |
+
if inbox: return {"action_type": "archive", "email_id": inbox[0]["id"], "priority_level": "normal"}
|
| 83 |
+
return {"action_type": "archive", "email_id": "", "priority_level": "normal"}
|
| 84 |
+
|
| 85 |
+
def run_inference(task="easy"):
|
| 86 |
+
print(f"[START] task={task} env=email-triage-env model={MODEL_NAME}")
|
| 87 |
+
try:
|
| 88 |
+
resp = requests.post(f"{API_BASE_URL}/reset", json={"task": task})
|
| 89 |
+
resp.raise_for_status()
|
| 90 |
+
res = resp.json()
|
| 91 |
+
observation = res.get("observation", {})
|
| 92 |
+
except Exception as e:
|
| 93 |
+
print(f"Error resetting env: {e}")
|
| 94 |
+
return
|
| 95 |
+
|
| 96 |
+
done = False
|
| 97 |
+
step = 0
|
| 98 |
+
rewards_list = []
|
| 99 |
+
|
| 100 |
+
while not done:
|
| 101 |
+
step += 1
|
| 102 |
+
inbox = observation.get("inbox", [])
|
| 103 |
+
if not inbox: break
|
| 104 |
+
|
| 105 |
+
action = get_action_from_llm(observation)
|
| 106 |
+
if action.get("email_id") not in [e["id"] for e in inbox]:
|
| 107 |
+
action["email_id"] = inbox[0]["id"]
|
| 108 |
+
|
| 109 |
+
try:
|
| 110 |
+
resp = requests.post(f"{API_BASE_URL}/step", json={"action": action})
|
| 111 |
+
resp.raise_for_status()
|
| 112 |
+
res = resp.json()
|
| 113 |
+
except: break
|
| 114 |
+
|
| 115 |
+
observation = res.get("observation", {}) or {}
|
| 116 |
+
reward = res.get("reward", 0.0)
|
| 117 |
+
done = res.get("done", True)
|
| 118 |
+
info = res.get("info", {})
|
| 119 |
+
error_msg = info.get("error") if isinstance(info, dict) else None
|
| 120 |
+
|
| 121 |
+
rewards_list.append(reward)
|
| 122 |
+
err_str = "null" if error_msg is None else f'"{error_msg}"'
|
| 123 |
+
print(f"[STEP] step={step} action={json.dumps(action)} reward={reward:.2f} done={str(done).lower()} error={err_str}")
|
| 124 |
+
|
| 125 |
+
total_score = sum(rewards_list)
|
| 126 |
+
rewards_str = ",".join([f"{r:.2f}" for r in rewards_list])
|
| 127 |
+
print(f"[END] success=true steps={step} score={total_score:.2f} rewards={rewards_str}")
|
| 128 |
+
|
| 129 |
+
if __name__ == "__main__":
|
| 130 |
+
time.sleep(1)
|
| 131 |
+
run_inference("easy")
|
| 132 |
+
run_inference("medium")
|
| 133 |
+
run_inference("hard")
|
openenv.yaml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: email-triage-env
|
| 2 |
+
version: 1.0.0
|
| 3 |
+
description: "Email triage environment evaluating an agent's capability to process inbox efficiently."
|
| 4 |
+
entrypoint: "server/app.py"
|
| 5 |
+
tasks:
|
| 6 |
+
- easy
|
| 7 |
+
- medium
|
| 8 |
+
- hard
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.104.1
|
| 2 |
+
uvicorn==0.24.0.post1
|
| 3 |
+
pydantic==2.5.0
|
| 4 |
+
requests==2.31.0
|
server/app.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI
|
| 2 |
+
from pydantic import BaseModel, Field
|
| 3 |
+
from typing import Dict, Any
|
| 4 |
+
from env.environment import EmailTriageEnv
|
| 5 |
+
|
| 6 |
+
app = FastAPI()
|
| 7 |
+
env = EmailTriageEnv()
|
| 8 |
+
|
| 9 |
+
class ResetRequest(BaseModel):
|
| 10 |
+
task: str = "easy"
|
| 11 |
+
|
| 12 |
+
class StepRequest(BaseModel):
|
| 13 |
+
action: Dict[str, Any] = Field(default_factory=dict)
|
| 14 |
+
|
| 15 |
+
@app.post("/reset")
|
| 16 |
+
async def reset_env(req: ResetRequest):
|
| 17 |
+
obs, info = await env.reset(req.task)
|
| 18 |
+
return {
|
| 19 |
+
"observation": obs.model_dump(),
|
| 20 |
+
"info": info
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
@app.post("/step")
|
| 24 |
+
async def step_env(req: StepRequest):
|
| 25 |
+
obs, reward, done, info = await env.step(req.action)
|
| 26 |
+
return {
|
| 27 |
+
"observation": obs.model_dump() if obs else None,
|
| 28 |
+
"reward": float(reward),
|
| 29 |
+
"done": bool(done),
|
| 30 |
+
"info": info
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
@app.get("/state")
|
| 34 |
+
async def get_state():
|
| 35 |
+
state = env.state()
|
| 36 |
+
return state.model_dump()
|