Salil-IND commited on
Commit
ac41c96
·
verified ·
1 Parent(s): 48ff810
email-triage-env/Dockerfile DELETED
@@ -1,19 +0,0 @@
1
- FROM python:3.10-slim
2
-
3
- # Create a non-root user (important for HF Spaces)
4
- RUN useradd -m -u 1000 user
5
-
6
- USER user
7
- ENV HOME=/home/user \
8
- PATH=/home/user/.local/bin:$PATH
9
-
10
- WORKDIR $HOME/app
11
-
12
- COPY --chown=user requirements.txt .
13
- RUN pip install --no-cache-dir -r requirements.txt
14
-
15
- COPY --chown=user . .
16
-
17
- EXPOSE 7860
18
-
19
- CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
email-triage-env/README.md DELETED
@@ -1,18 +0,0 @@
1
- # Email Triage OpenEnv
2
-
3
- A real-world OpenEnv simulator that tests an agent's ability to efficiently manage an inbox through replying, forwarding, archiving, and identifying spam with constraints and SLA tracking.
4
-
5
- ## Structure
6
- - `env/`: Environment logic and pydantic models
7
- - `server/`: FastAPI server wrapper
8
- - `inference.py`: Standard inference script connecting to OpenAI LLMs.
9
-
10
- ## Deployment (Hugging Face Spaces Compatible)
11
- Run with Docker:
12
- `docker build -t email-env .`
13
- `docker run -p 7860:7860 email-env`
14
-
15
- Validate:
16
- `openenv validate`
17
-
18
- Start testing APIs on 0.0.0.0:7860.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
email-triage-env/env/__init__.py DELETED
@@ -1 +0,0 @@
1
- # Email Triage Environment
 
 
email-triage-env/env/environment.py DELETED
@@ -1,66 +0,0 @@
1
- from typing import Dict, Any, Tuple
2
- from .models import State, Action, Observation
3
- from .tasks import get_initial_state
4
- from .graders import grade_action
5
-
6
- class EmailTriageEnv:
7
- def __init__(self):
8
- self._state: State = None
9
- self.current_task: str = "easy"
10
-
11
- async def reset(self, task: str = "easy") -> Tuple[Observation, Dict[str, Any]]:
12
- self.current_task = task
13
- self._state = get_initial_state(task)
14
- return self._state.observation, {}
15
-
16
- async def step(self, action_dict: dict) -> Tuple[Observation, float, bool, Dict[str, Any]]:
17
- if self._state is None or self._state.is_done:
18
- obs = self._state.observation if self._state else None
19
- return obs, 0.0, True, {"error": "Environment must be reset before stepping"}
20
-
21
- try:
22
- action = Action(**action_dict)
23
- except Exception as e:
24
- self._state.step_count += 1
25
- if self._state.step_count >= self._state.max_steps:
26
- self._state.is_done = True
27
- return self._state.observation, 0.0, self._state.is_done, {"error": f"Invalid action format: {str(e)}"}
28
-
29
- self._state.step_count += 1
30
-
31
- email_to_process = None
32
- for i, email in enumerate(self._state.observation.inbox):
33
- if email.id == action.email_id:
34
- email_to_process = self._state.observation.inbox.pop(i)
35
- break
36
-
37
- if not email_to_process:
38
- self._state.is_done = len(self._state.observation.inbox) == 0 or self._state.step_count >= self._state.max_steps
39
- return self._state.observation, 0.0, self._state.is_done, {"error": "Email ID not found in inbox"}
40
-
41
- reward = grade_action(self.current_task, action, email_to_process, self._state)
42
- reward = max(0.0, min(1.0, reward))
43
- self._state.score = max(0.0, min(1.0, self._state.score + reward))
44
-
45
- if action.action_type == "reply":
46
- self._state.observation.replied.append(email_to_process)
47
- elif action.action_type == "forward":
48
- self._state.observation.forwarded.append(email_to_process)
49
- elif action.action_type == "archive":
50
- self._state.observation.archived.append(email_to_process)
51
- elif action.action_type == "mark_spam":
52
- self._state.observation.spam.append(email_to_process)
53
- elif action.action_type == "request_info":
54
- self._state.observation.pending_info.append(email_to_process)
55
- elif action.action_type == "escalate":
56
- self._state.observation.escalated.append(email_to_process)
57
-
58
- if len(self._state.observation.inbox) == 0 or self._state.step_count >= self._state.max_steps:
59
- self._state.is_done = True
60
-
61
- return self._state.observation, reward, self._state.is_done, {}
62
-
63
- def state(self) -> State:
64
- if self._state is None:
65
- self._state = get_initial_state("easy")
66
- return self._state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
email-triage-env/env/graders.py DELETED
@@ -1,106 +0,0 @@
1
- from .models import Action, Email, State
2
- from .reward import compute_dense_reward
3
-
4
- def grade_easy(action: Action, email: Email) -> float:
5
- if email.id == "e1":
6
- if action.action_type == "reply":
7
- return compute_dense_reward(True)
8
- return compute_dense_reward(False)
9
- if email.id == "e2":
10
- if action.action_type == "mark_spam":
11
- return compute_dense_reward(True)
12
- return compute_dense_reward(False)
13
- return compute_dense_reward(False)
14
-
15
- def grade_medium(action: Action, email: Email) -> float:
16
- if email.id == "m1":
17
- # Missing order ID logic -> should request info
18
- if action.action_type == "request_info":
19
- return compute_dense_reward(True)
20
- elif action.action_type == "reply" and action.response_text:
21
- if "order" in action.response_text.lower() and "id" in action.response_text.lower():
22
- return compute_dense_reward(True)
23
- return compute_dense_reward(True) * 0.5
24
- elif action.action_type == "forward" and action.forward_to and "support" in action.forward_to.lower():
25
- # Escalating an incomplete ticket rather than asking for info gets partial score
26
- return compute_dense_reward(True) * 0.3
27
- return compute_dense_reward(False)
28
- elif email.id == "m2":
29
- if action.action_type in ["mark_spam", "archive"]:
30
- return compute_dense_reward(True)
31
- return compute_dense_reward(False)
32
- elif email.id == "m3":
33
- if action.action_type == "reply" and action.response_text and len(action.response_text) > 5:
34
- return compute_dense_reward(True)
35
- elif action.action_type == "reply":
36
- return compute_dense_reward(True) * 0.5
37
- return compute_dense_reward(False)
38
- return compute_dense_reward(False)
39
-
40
- def grade_hard(action: Action, email: Email) -> float:
41
- if email.id == "h1": # VIP Issue
42
- # We need reply containing "investigating" or "downtime"
43
- base = 0.0
44
- if action.action_type == "reply":
45
- if action.response_text and "investigat" in action.response_text.lower():
46
- base = 1.0
47
- else:
48
- base = 0.5
49
- elif action.action_type == "escalate":
50
- if action.priority_level == "urgent":
51
- base = 1.0
52
- else:
53
- base = 0.7
54
- return compute_dense_reward(True) * base
55
-
56
- elif email.id == "h2": # Alert
57
- if action.action_type == "escalate":
58
- if action.priority_level == "urgent":
59
- return compute_dense_reward(True)
60
- return compute_dense_reward(True) * 0.7
61
- elif action.action_type == "forward" and action.forward_to and "eng" in action.forward_to.lower():
62
- if action.priority_level == "urgent":
63
- return compute_dense_reward(True)
64
- return compute_dense_reward(True) * 0.5
65
- return compute_dense_reward(False)
66
-
67
- elif email.id == "h3": # Compliance
68
- if action.action_type == "reply":
69
- if action.response_text and "acknowledge" in action.response_text.lower():
70
- return compute_dense_reward(True)
71
- return compute_dense_reward(True) * 0.4
72
- elif action.action_type == "forward" and action.forward_to and "legal" in action.forward_to.lower():
73
- return compute_dense_reward(True)
74
- return compute_dense_reward(False)
75
-
76
- elif email.id == "h4": # CEO Phish Attack
77
- if action.action_type == "mark_spam":
78
- return compute_dense_reward(True)
79
- elif action.action_type == "archive" or action.action_type == "escalate":
80
- return compute_dense_reward(True) * 0.5
81
- return compute_dense_reward(False)
82
-
83
- elif email.id == "h5": # Intern project
84
- if action.action_type == "reply":
85
- return compute_dense_reward(True)
86
- return compute_dense_reward(False)
87
-
88
- return compute_dense_reward(False)
89
-
90
- def grade_action(task_level: str, action: Action, email: Email, state: State) -> float:
91
- tasks_sizes = {"easy": 2, "medium": 3, "hard": 5}
92
- size = tasks_sizes.get(task_level, 1)
93
-
94
- if task_level == "easy":
95
- r = grade_easy(action, email)
96
- elif task_level == "medium":
97
- r = grade_medium(action, email)
98
- elif task_level == "hard":
99
- r = grade_hard(action, email)
100
- else:
101
- r = 0.0
102
-
103
- # Strictly bound reward between 0.0 and 1.0 for this step
104
- # Max episode score stays bounded since pop removes 1 email, at max 1 for each email / sizes = 1.0 total max
105
- scaled_reward = max(0.0, min(1.0, r / size))
106
- return scaled_reward
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
email-triage-env/env/models.py DELETED
@@ -1,32 +0,0 @@
1
- from pydantic import BaseModel, Field
2
- from typing import List, Optional, Dict, Any
3
-
4
- class Email(BaseModel):
5
- id: str
6
- sender: str
7
- subject: str
8
- body: str
9
- metadata: Dict[str, Any] = Field(default_factory=dict, description="Metadata like SLA, tags, or threat-level.")
10
-
11
- class Observation(BaseModel):
12
- inbox: List[Email]
13
- archived: List[Email]
14
- replied: List[Email]
15
- forwarded: List[Email]
16
- spam: List[Email]
17
- escalated: List[Email]
18
- pending_info: List[Email]
19
-
20
- class Action(BaseModel):
21
- action_type: str = Field(..., description="'reply', 'forward', 'archive', 'mark_spam', 'request_info', or 'escalate'")
22
- email_id: str
23
- response_text: Optional[str] = None
24
- forward_to: Optional[str] = None
25
- priority_level: str = Field(default="normal", description="'urgent', 'normal', or 'low'")
26
-
27
- class State(BaseModel):
28
- step_count: int
29
- max_steps: int
30
- score: float
31
- is_done: bool
32
- observation: Observation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
email-triage-env/env/reward.py DELETED
@@ -1,2 +0,0 @@
1
- def compute_dense_reward(correct_action: bool) -> float:
2
- return 1.0 if correct_action else 0.0
 
 
 
email-triage-env/env/tasks.py DELETED
@@ -1,58 +0,0 @@
1
- from pydantic import BaseModel
2
- from typing import List
3
- from .models import Email, Observation, State
4
-
5
- class TaskConfig(BaseModel):
6
- difficulty: str
7
- initial_inbox: List[Email]
8
- max_steps: int
9
-
10
- TASKS = {
11
- "easy": TaskConfig(
12
- difficulty="easy",
13
- initial_inbox=[
14
- Email(id="e1", sender="boss@company.com", subject="Meeting", body="Are we still on for 3 PM?", metadata={"SLA": "24h"}),
15
- Email(id="e2", sender="spam@deals.com", subject="Buy now!", body="Get 50% off pills.", metadata={"spam_score": 0.99})
16
- ],
17
- max_steps=5
18
- ),
19
- "medium": TaskConfig(
20
- difficulty="medium",
21
- initial_inbox=[
22
- # Missing order ID - needs request_info
23
- Email(id="m1", sender="customer@help.com", subject="Broken item", body="My order arrived broken. I need a refund immediately.", metadata={"customer_tier": "standard"}),
24
- Email(id="m2", sender="marketing@agency.com", subject="SEO services", body="We can skyrocket your traffic.", metadata={"SLA": "none"}),
25
- Email(id="m3", sender="hr@company.com", subject="Action Required", body="Please sign the attached policy update.", metadata={"SLA": "48h"})
26
- ],
27
- max_steps=10
28
- ),
29
- "hard": TaskConfig(
30
- difficulty="hard",
31
- initial_inbox=[
32
- Email(id="h1", sender="vip@enterprise.com", subject="SYSTEM DOWN", body="Our production environment is offline. Why is your service failing?", metadata={"customer_tier": "VIP", "SLA": "1h"}),
33
- Email(id="h2", sender="alerts@sys.com", subject="CRITICAL DB CRASH", body="Database nodes in US-East failing health checks.", metadata={"alert_level": "critical", "SLA": "15m"}),
34
- Email(id="h3", sender="legal@company.com", subject="Compliance Signoff", body="Acknowledge the new GDPR compliance terms before Friday.", metadata={"SLA": "72h"}),
35
- Email(id="h4", sender="ceo_real_not_fake@phish.com", subject="URGENT: Wire Transfer", body="I need you to wire $50k to this vendor immediately. Do not call me, I am in a meeting.", metadata={"spam_score": 0.85, "SPF": "fail"}),
36
- Email(id="h5", sender="intern@company.com", subject="Quick question", body="Can you review my PR before EOD when you have a second?", metadata={"SLA": "24h"})
37
- ],
38
- max_steps=12
39
- )
40
- }
41
-
42
- def get_initial_state(task_level: str) -> State:
43
- task = TASKS.get(task_level, TASKS["easy"])
44
- return State(
45
- step_count=0,
46
- max_steps=task.max_steps,
47
- score=0.0,
48
- is_done=False,
49
- observation=Observation(
50
- inbox=[email.model_copy() for email in task.initial_inbox],
51
- archived=[],
52
- replied=[],
53
- forwarded=[],
54
- spam=[],
55
- escalated=[],
56
- pending_info=[]
57
- )
58
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
email-triage-env/inference.py DELETED
@@ -1,133 +0,0 @@
1
- import os
2
- import requests
3
- import json
4
- import time
5
- import re
6
-
7
- API_BASE_URL = os.getenv("API_BASE_URL", "http://127.0.0.1:7860")
8
- MODEL_NAME = os.getenv("MODEL_NAME", "gpt-4-turbo")
9
- HF_TOKEN = os.getenv("HF_TOKEN", "hf_mock_token")
10
-
11
- class MockOpenAI:
12
- def __init__(self, api_key):
13
- self.api_key = api_key
14
- self.chat = self.Chat()
15
-
16
- class Chat:
17
- def __init__(self):
18
- self.completions = self.Completions()
19
-
20
- class Completions:
21
- def create(self, model, messages, response_format=None):
22
- class ResponseMessage:
23
- def __init__(self, content): self.content = content
24
- class ResponseChoice:
25
- def __init__(self, message): self.message = message
26
- class Response:
27
- def __init__(self, choices): self.choices = choices
28
-
29
- content = str(messages[-1]["content"]).lower()
30
- action = {"action_type": "reply", "email_id": "", "response_text": "Noted.", "forward_to": None, "priority_level": "normal"}
31
-
32
- match = re.search(r"'id': '([^']+)'", str(messages))
33
- if match: action["email_id"] = match.group(1)
34
-
35
- if "spam" in content or "wire" in content or "password" in content or "phish" in content or "traffic" in content:
36
- action["action_type"] = "mark_spam"
37
- elif "broken" in content and "order" in content and "id" not in content:
38
- action["action_type"] = "request_info"
39
- action["response_text"] = "Please provide your order ID."
40
- elif "downtime" in content or "offline" in content or "fail" in content:
41
- if "vip" in content:
42
- action["action_type"] = "reply"
43
- action["priority_level"] = "urgent"
44
- action["response_text"] = "We are currently investigating the downtime."
45
- else:
46
- action["action_type"] = "escalate"
47
- action["priority_level"] = "urgent"
48
- elif "compliance" in content:
49
- action["action_type"] = "reply"
50
- action["response_text"] = "I acknowledge the terms."
51
-
52
- return Response([ResponseChoice(ResponseMessage(json.dumps(action)))])
53
-
54
- try:
55
- from openai import OpenAI
56
- client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", "default-key"), base_url=os.getenv("OPENAI_BASE_URL", None))
57
- except ImportError:
58
- client = MockOpenAI(api_key="mock")
59
-
60
- def get_action_from_llm(state_observation):
61
- system_prompt = """You are an elite email triage AI agent. Make fast, precise triage decisions based on email bodies, subjects, and metadata schemas like SLA or threat-level properties.
62
- You must choose one of these actions: reply, forward, archive, mark_spam, request_info, escalate.
63
- If you respond, ensure response_text handles context constraints professionally.
64
- If you deal with critical downtime metadata, utilize escalate with urgent priority_level.
65
- Return ONLY valid JSON: {"action_type": "...", "email_id": "...", "forward_to": "...", "response_text": "...", "priority_level": "..."}"""
66
-
67
- messages = [
68
- {"role": "system", "content": system_prompt},
69
- {"role": "user", "content": f"Current Inbox Observation: {json.dumps(state_observation)}"}
70
- ]
71
-
72
- response = client.chat.completions.create(
73
- model=MODEL_NAME,
74
- messages=messages,
75
- response_format={"type": "json_object"}
76
- )
77
- try:
78
- data = json.loads(response.choices[0].message.content)
79
- return data
80
- except:
81
- inbox = state_observation.get("inbox", [])
82
- if inbox: return {"action_type": "archive", "email_id": inbox[0]["id"], "priority_level": "normal"}
83
- return {"action_type": "archive", "email_id": "", "priority_level": "normal"}
84
-
85
- def run_inference(task="easy"):
86
- print(f"[START] task={task} env=email-triage-env model={MODEL_NAME}")
87
- try:
88
- resp = requests.post(f"{API_BASE_URL}/reset", json={"task": task})
89
- resp.raise_for_status()
90
- res = resp.json()
91
- observation = res.get("observation", {})
92
- except Exception as e:
93
- print(f"Error resetting env: {e}")
94
- return
95
-
96
- done = False
97
- step = 0
98
- rewards_list = []
99
-
100
- while not done:
101
- step += 1
102
- inbox = observation.get("inbox", [])
103
- if not inbox: break
104
-
105
- action = get_action_from_llm(observation)
106
- if action.get("email_id") not in [e["id"] for e in inbox]:
107
- action["email_id"] = inbox[0]["id"]
108
-
109
- try:
110
- resp = requests.post(f"{API_BASE_URL}/step", json={"action": action})
111
- resp.raise_for_status()
112
- res = resp.json()
113
- except: break
114
-
115
- observation = res.get("observation", {}) or {}
116
- reward = res.get("reward", 0.0)
117
- done = res.get("done", True)
118
- info = res.get("info", {})
119
- error_msg = info.get("error") if isinstance(info, dict) else None
120
-
121
- rewards_list.append(reward)
122
- err_str = "null" if error_msg is None else f'"{error_msg}"'
123
- print(f"[STEP] step={step} action={json.dumps(action)} reward={reward:.2f} done={str(done).lower()} error={err_str}")
124
-
125
- total_score = sum(rewards_list)
126
- rewards_str = ",".join([f"{r:.2f}" for r in rewards_list])
127
- print(f"[END] success=true steps={step} score={total_score:.2f} rewards={rewards_str}")
128
-
129
- if __name__ == "__main__":
130
- time.sleep(1)
131
- run_inference("easy")
132
- run_inference("medium")
133
- run_inference("hard")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
email-triage-env/openenv.yaml DELETED
@@ -1,8 +0,0 @@
1
- name: email-triage-env
2
- version: 1.0.0
3
- description: "Email triage environment evaluating an agent's capability to process inbox efficiently."
4
- entrypoint: "server/app.py"
5
- tasks:
6
- - easy
7
- - medium
8
- - hard
 
 
 
 
 
 
 
 
 
email-triage-env/requirements.txt DELETED
@@ -1,4 +0,0 @@
1
- fastapi==0.104.1
2
- uvicorn==0.24.0.post1
3
- pydantic==2.5.0
4
- requests==2.31.0
 
 
 
 
 
email-triage-env/server/app.py DELETED
@@ -1,36 +0,0 @@
1
- from fastapi import FastAPI
2
- from pydantic import BaseModel, Field
3
- from typing import Dict, Any
4
- from env.environment import EmailTriageEnv
5
-
6
- app = FastAPI()
7
- env = EmailTriageEnv()
8
-
9
- class ResetRequest(BaseModel):
10
- task: str = "easy"
11
-
12
- class StepRequest(BaseModel):
13
- action: Dict[str, Any] = Field(default_factory=dict)
14
-
15
- @app.post("/reset")
16
- async def reset_env(req: ResetRequest):
17
- obs, info = await env.reset(req.task)
18
- return {
19
- "observation": obs.model_dump(),
20
- "info": info
21
- }
22
-
23
- @app.post("/step")
24
- async def step_env(req: StepRequest):
25
- obs, reward, done, info = await env.step(req.action)
26
- return {
27
- "observation": obs.model_dump() if obs else None,
28
- "reward": float(reward),
29
- "done": bool(done),
30
- "info": info
31
- }
32
-
33
- @app.get("/state")
34
- async def get_state():
35
- state = env.state()
36
- return state.model_dump()