Spaces:

DevanshuDon
/

exec-assist

Sleeping

App Files Files Community

DevanshuDon committed 20 days ago

Commit

d09534e

verified ·

1 Parent(s): 722231e

Upload 8 files

Browse files

Files changed (8) hide show

server/__init__.py +1 -0
server/__pycache__/__init__.cpython-312.pyc +0 -0
server/__pycache__/app.cpython-312.pyc +0 -0
server/__pycache__/data.cpython-312.pyc +0 -0
server/__pycache__/models.cpython-312.pyc +0 -0
server/app.py +366 -0
server/data.py +670 -0
server/models.py +98 -0

server/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """ExecAssist server package."""

server/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (245 Bytes). View file

server/__pycache__/app.cpython-312.pyc ADDED Viewed

Binary file (11.4 kB). View file

server/__pycache__/data.cpython-312.pyc ADDED Viewed

Binary file (20.6 kB). View file

server/__pycache__/models.cpython-312.pyc ADDED Viewed

Binary file (4.21 kB). View file

server/app.py ADDED Viewed

	@@ -0,0 +1,366 @@

+"""
+app.py — Executive Assistant OpenEnv Environment
+FastAPI server + environment logic for email and calendar management.
+"""
+import sys
+import os
+from pathlib import Path
+# Ensure server/ directory is on the path
+sys.path.insert(0, str(Path(__file__).parent))
+from fastapi import FastAPI, HTTPException
+from models import AssistantAction, AssistantObservation, AssistantState
+from typing import Optional
+import statistics
+# Import scoring functions from data.py (teammate will implement these)
+from data import (
+    generate_scenario,
+    compute_email_quality,
+    check_scheduling_correctness,
+    compute_conflict_resolution,
+    apply_penalties,
+    TASK_DEFINITIONS,
+)
+# ============================================================
+# THE ENVIRONMENT CLASS
+# ============================================================
class ExecAssistEnv:
    """Single-step executive-assistant environment.

    An episode starts with :meth:`reset`, which generates a scenario (emails,
    calendar, contacts) for the chosen task. The agent then submits one
    action through :meth:`step`. A well-formed action is scored and ends the
    episode; a malformed action returns a negative reward and leaves the
    episode open so the agent can retry.
    """

    # Calendar actions accepted by step(); anything else is rejected with a penalty.
    VALID_CALENDAR_ACTIONS = ("book", "propose_alternatives", "reschedule", "decline")

    def __init__(self):
        self.current_scenario = None
        self.calendar_state = None
        self.episode_done = False
        self.steps_taken = 0
        self.total_score = 0.0
        self.current_task = None
        self.seed = 42

    def reset(self, task: str = "easy"):
        """Start a new episode for ``task``.

        Args:
            task: Key of ``TASK_DEFINITIONS`` selecting the difficulty.

        Returns:
            dict with ``observation``, ``reward`` (0.0), ``done`` (False) and
            ``info`` (task name and scenario id).

        Raises:
            ValueError: if ``task`` is not a key of ``TASK_DEFINITIONS``.
        """
        if task not in TASK_DEFINITIONS:
            # Derive the list from the definitions so the message cannot go stale.
            choices = ", ".join(TASK_DEFINITIONS)
            raise ValueError(f"Unknown task: {task}. Choose from: {choices}")
        self.current_task = task
        self.episode_done = False
        self.steps_taken = 0
        self.total_score = 0.0
        self.current_scenario = generate_scenario(task, seed=self.seed)
        return {
            "observation": self._build_observation(),
            "reward": 0.0,
            "done": False,
            "info": {
                "task": task,
                "scenario_id": self.current_scenario.get("id", "unknown"),
            },
        }

    def step(self, action: dict):
        """Score the agent's action and finish the episode.

        Args:
            action: Mapping with the fields of ``AssistantAction``.

        Returns:
            dict with ``observation``, ``reward``, ``done`` and ``info``.
            Malformed actions yield a negative reward with ``done`` False;
            a scored action yields a reward clamped to [0, 1] with ``done`` True.
        """
        if self.episode_done:
            return {
                "observation": {"message": "Episode is done. Call /reset to start again."},
                "reward": 0.0,
                "done": True,
                "info": {"total_score": self.total_score},
            }
        # Without a scenario there is nothing to score against; previously this
        # path crashed on the TASK_DEFINITIONS / scenario lookups further down.
        if self.current_scenario is None or self.current_task is None:
            return {
                "observation": {"message": "No active episode. Call /reset before /step."},
                "reward": 0.0,
                "done": False,
                "info": {"error": "no_active_episode"},
            }
        try:
            assistant_action = AssistantAction(**action)
        except Exception as e:
            return {
                "observation": {"message": f"Invalid action format: {str(e)}"},
                "reward": -0.5,
                "done": False,
                "info": {"error": "invalid_action_format"},
            }
        if not assistant_action.email_reply or not assistant_action.email_reply.strip():
            return {
                "observation": {"message": "Empty email reply. Penalty applied."},
                "reward": -0.3,
                "done": False,
                "info": {"error": "empty_email_reply"},
            }
        if assistant_action.calendar_action not in self.VALID_CALENDAR_ACTIONS:
            return {
                "observation": {"message": f"Invalid calendar_action: {assistant_action.calendar_action}"},
                "reward": -0.2,
                "done": False,
                "info": {"error": "invalid_calendar_action"},
            }

        # Serialise the action once; the scoring helpers work on plain dicts.
        action_dict = assistant_action.dict()
        meeting_details_dict = (
            assistant_action.meeting_details.dict() if assistant_action.meeting_details else None
        )

        email_score = compute_email_quality(assistant_action.email_reply, self.current_scenario)
        scheduling_result = check_scheduling_correctness(meeting_details_dict, self.current_scenario)
        conflict_score = compute_conflict_resolution(action_dict, self.current_scenario)
        penalty = apply_penalties(action_dict, self.current_scenario)

        # Weighted sum of the three components, weights depend on the task.
        weights = TASK_DEFINITIONS[self.current_task]["reward_weights"]
        total_reward = (
            weights["email"] * email_score +
            weights["scheduling"] * scheduling_result["score"] +
            weights["conflict"] * conflict_score
        )
        total_reward = max(0.0, min(1.0, total_reward - penalty))
        self.total_score = total_reward
        self.episode_done = True
        self.steps_taken += 1
        return {
            "observation": self._build_completion_message(assistant_action, total_reward),
            "reward": round(total_reward, 4),
            "done": True,
            "info": {
                "email_score": round(email_score, 4),
                "scheduling_score": round(scheduling_result["score"], 4),
                "conflict_score": round(conflict_score, 4),
                "penalty": round(penalty, 4),
                "scheduling_checks": scheduling_result.get("checks", {}),
            },
        }

    def _build_observation(self) -> dict:
        """Assemble the agent-visible view of the current scenario."""
        scenario = self.current_scenario
        task_def = TASK_DEFINITIONS[self.current_task]
        return {
            "task": self.current_task,
            "description": task_def["description"],
            "emails": scenario["emails"],
            "calendar": scenario["calendar"],
            "contacts": scenario.get("contacts", {}),
            "action_required": task_def["action_required"],
        }

    def _build_completion_message(self, action: "AssistantAction", score: float) -> dict:
        """Build the post-step feedback: a graded message plus an echo of the action."""
        if score >= 0.9:
            message = f"Excellent work! Score: {score:.2f}"
        elif score >= 0.7:
            message = f"Good response. Score: {score:.2f}"
        elif score >= 0.5:
            message = f"Acceptable. Score: {score:.2f}"
        else:
            message = f"Needs improvement. Score: {score:.2f}"
        # Echo at most the first 100 characters of the reply.
        if len(action.email_reply) > 100:
            email_preview = action.email_reply[:100] + "..."
        else:
            email_preview = action.email_reply
        return {
            "message": message,
            "email_sent": email_preview,
            "calendar_action": action.calendar_action,
        }

    def state(self):
        """Return a snapshot of the episode counters (safe to call before reset)."""
        return {
            "current_task": self.current_task,
            "emails_pending": len(self.current_scenario.get("emails", [])) if self.current_scenario else 0,
            "episode_done": self.episode_done,
            "steps_taken": self.steps_taken,
            "total_score": self.total_score,
        }
+# ============================================================
+# FASTAPI SERVER
+# ============================================================
# ASGI application object; the route handlers below are registered on it.
app = FastAPI(
    version="1.0.0",
    title="ExecAssist Environment",
    description=(
        "An OpenEnv environment where AI agents learn to manage email and calendar for a busy executive. "
        "Agents must draft professional replies, schedule meetings, and resolve conflicts."
    ),
)
# One module-level environment instance, used by every route handler.
env = ExecAssistEnv()
@app.post("/reset")
def reset(task: str = "easy"):
    """Start a new episode for ``task`` and return the initial observation.

    An unknown task name is a client error, so the ``ValueError`` raised by
    the environment is reported as HTTP 400 instead of an unhandled 500.
    """
    try:
        return env.reset(task)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
@app.post("/step")
def step(action: AssistantAction):
    """Forward the validated action to the environment as a plain dict."""
    payload = action.dict()
    return env.step(payload)
@app.get("/state")
def state():
    """Return the environment's current episode snapshot."""
    snapshot = env.state()
    return snapshot
@app.get("/tasks")
def tasks():
    """List every task with its description, required action and reward weights."""
    exposed_fields = ("description", "action_required", "reward_weights")
    catalogue = {}
    for task_name, definition in TASK_DEFINITIONS.items():
        catalogue[task_name] = {field: definition[field] for field in exposed_fields}
    return catalogue
@app.get("/health")
def health():
    """Liveness probe: always reports a healthy status."""
    return dict(status="healthy")
+# ============================================================
+# OPENENV REQUIRED ENDPOINTS
+# ============================================================
@app.get("/metadata")
def metadata():
    """Describe the environment: name, summary, version, author and task names."""
    summary = (
        "Executive Assistant environment where AI agents learn to manage email and calendar "
        "for busy professionals. Agents must balance professionalism, scheduling correctness, "
        "and conflict resolution."
    )
    info = {"name": "exec-assist", "description": summary}
    info["version"] = "1.0.0"
    info["author"] = "Gang-gay"
    info["tasks"] = ["easy", "medium", "hard"]
    return info
@app.get("/schema")
def schema():
    """Return JSON-schema-style descriptions of the action, observation and state."""

    def typed(type_name):
        # Smallest schema fragment: just a type tag.
        return {"type": type_name}

    action_schema = {
        "type": "object",
        "properties": {
            "email_reply": typed("string"),
            "calendar_action": {
                "type": "string",
                "enum": ["book", "propose_alternatives", "reschedule", "decline"],
            },
            "meeting_details": typed("object"),
        },
        "required": ["email_reply", "calendar_action"],
    }
    observation_schema = {
        "type": "object",
        "properties": {
            "task": typed("string"),
            "emails": typed("array"),
            "calendar": typed("object"),
            "contacts": typed("object"),
        },
    }
    state_schema = {
        "type": "object",
        "properties": {
            "current_task": typed("string"),
            "emails_pending": typed("integer"),
            "episode_done": typed("boolean"),
            "steps_taken": typed("integer"),
            "total_score": typed("number"),
        },
    }
    return {
        "action": action_schema,
        "observation": observation_schema,
        "state": state_schema,
    }
@app.post("/mcp")
async def mcp_endpoint(request_body: Optional[dict] = None):
    """Minimal MCP JSON-RPC endpoint.

    Handles ``initialize`` (server info and capabilities) and ``tools/list``
    (the reset/step/state tool descriptors). Any other method gets an empty
    ``result``. The request ``id`` is echoed back, defaulting to 1.

    Args:
        request_body: Decoded JSON-RPC request; a missing or null body is
            treated as an empty request.
    """
    # None default instead of a shared mutable ``{}`` default argument.
    body = request_body or {}
    method = body.get("method", "")
    req_id = body.get("id", 1)
    if method == "initialize":
        return {
            "jsonrpc": "2.0",
            "id": req_id,
            "result": {
                "protocolVersion": "2024-11-05",
                "serverInfo": {"name": "exec-assist", "version": "1.0.0"},
                "capabilities": {"tools": {"listChanged": False}},
            },
        }
    if method == "tools/list":
        return {
            "jsonrpc": "2.0",
            "id": req_id,
            "result": {
                "tools": [
                    {
                        "name": "reset",
                        "description": "Start new episode (easy/medium/hard)",
                        "inputSchema": {
                            "type": "object",
                            "properties": {"task": {"type": "string", "enum": ["easy", "medium", "hard"]}},
                        },
                    },
                    {
                        "name": "step",
                        "description": "Submit email reply and calendar action",
                        "inputSchema": {
                            "type": "object",
                            "properties": {
                                "email_reply": {"type": "string"},
                                "calendar_action": {"type": "string"},
                                "meeting_details": {"type": "object"},
                            },
                            "required": ["email_reply", "calendar_action"],
                        },
                    },
                    {"name": "state", "description": "Get current state", "inputSchema": {"type": "object"}},
                ],
            },
        }
    # Unrecognised methods are acknowledged with an empty result.
    return {"jsonrpc": "2.0", "id": req_id, "result": {}}
def main(host: str = "0.0.0.0", port: int = 8000):
    """Serve the FastAPI app with uvicorn.

    Args:
        host: Interface to bind; the default listens on all interfaces.
        port: TCP port to listen on.
    """
    # Imported lazily so importing this module does not require uvicorn.
    import uvicorn
    uvicorn.run(app, host=host, port=port)


if __name__ == "__main__":
    main()

server/data.py ADDED Viewed

	@@ -0,0 +1,670 @@

+"""
+data.py — ExecAssist Environment Data & Scoring
+Contains:
+- Scenario templates for easy/medium/hard tasks
+- Reward functions (email quality, scheduling correctness, conflict resolution)
+- Anti-reward hacking penalties
+- Helper functions for time/calendar logic
+"""
+import random
+import os
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional
+from openai import OpenAI
+# ============================================================
+# TASK DEFINITIONS
+# ============================================================
# Per-difficulty task catalogue. Each entry carries the text shown to the agent
# ("description", "action_required") and the weights used to combine the
# email / scheduling / conflict scores into one reward.
TASK_DEFINITIONS = {
    "easy": dict(
        description=(
            "Simple meeting request with clear calendar availability. "
            "Draft professional reply and book the meeting."
        ),
        action_required="Send email confirmation and book meeting in available slot",
        reward_weights=dict(email=0.5, scheduling=0.5, conflict=0.0),
    ),
    "medium": dict(
        description=(
            "Scheduling conflict — requested time is already booked. "
            "Identify conflict, propose 2-3 alternatives, explain professionally."
        ),
        action_required="Send email with alternative times and explain conflict",
        reward_weights=dict(email=0.3, scheduling=0.3, conflict=0.4),
    ),
    "hard": dict(
        description=(
            "Multi-party coordination with priority conflicts. "
            "3 emails requesting meetings, prioritize and reschedule."
        ),
        action_required="Coordinate multiple meetings, prioritize, and reschedule",
        reward_weights=dict(email=0.34, scheduling=0.33, conflict=0.33),
    ),
}
+# ============================================================
+# SCENARIO DATA POOLS
+# ============================================================
# Subjects a generated email can ask to meet about.
MEETING_TOPICS = [
    "Q2 roadmap review", "Budget planning session",
    "Project status update", "Team sync",
    "1-on-1 check-in", "Client presentation prep",
    "Sprint retrospective", "Product demo",
    "Strategy discussion", "Performance review",
]
# (display name, email) pairs; every address is first.last@company.com.
SENDER_NAMES = [
    (full_name, f"{full_name.lower().replace(' ', '.')}@company.com")
    for full_name in (
        "John Smith",
        "Sarah Johnson",
        "Michael Chen",
        "Emily Rodriguez",
        "David Kim",
        "Lisa Wang",
        "James Anderson",
        "Maria Garcia",
    )
]
+# ============================================================
+# SCENARIO GENERATION
+# ============================================================
def generate_scenario(task: str, seed: int = None) -> dict:
    """
    Build the scenario dict for one task difficulty.
    The returned dict has the keys:
    - id: scenario identifier
    - emails: list of email objects
    - calendar: calendar state with existing meetings
    - contacts: contact information
    - expected_behavior: what agent should do
    - has_conflict: True if scheduling conflict exists
    Raises ValueError for a task other than easy / medium / hard.
    """
    if task not in ("easy", "medium", "hard"):
        raise ValueError(f"Unknown task: {task}")
    # Random(None) seeds itself from the system, same as Random() with no argument.
    rng = random.Random(seed)
    if task == "easy":
        return _generate_easy_scenario(rng)
    if task == "medium":
        return _generate_medium_scenario(rng)
    return _generate_hard_scenario(rng)
def _generate_easy_scenario(rng: random.Random) -> dict:
    """Generate a simple meeting request with clear availability.

    Args:
        rng: Source of randomness for the sender and topic (sender is drawn
            first, then topic).

    Returns:
        Scenario dict with one flexible 30-minute request, a calendar holding
        two existing meetings, the sender's contact card, and
        ``has_conflict`` False.
    """
    sender_name, sender_email = rng.choice(SENDER_NAMES)
    topic = rng.choice(MEETING_TOPICS)
    # 2026-04-27 is a Monday. The previous anchor (2026-04-28) was a Tuesday,
    # which shifted every "Monday"/"Tuesday" meeting one day late.
    base_date = datetime(2026, 4, 27, 9, 0)  # Monday 9 AM
    existing_meetings = [
        {
            "id": "mtg_001",
            "participants": ["alex.chen@company.com", "team@company.com"],
            "start_time": (base_date + timedelta(hours=1)).isoformat(),  # Monday 10 AM
            "end_time": (base_date + timedelta(hours=2)).isoformat(),  # Monday 11 AM
            "subject": "Team standup",
            "priority": "normal",
        },
        {
            "id": "mtg_002",
            "participants": ["alex.chen@company.com", "client@external.com"],
            "start_time": (base_date + timedelta(days=1, hours=5)).isoformat(),  # Tuesday 2 PM
            "end_time": (base_date + timedelta(days=1, hours=6, minutes=30)).isoformat(),  # Tuesday 3:30 PM
            "subject": "Client call",
            "priority": "high",
        },
    ]
    email_body = f"Hi Alex,\n\nCan we meet sometime next week to discuss {topic.lower()}? 30 minutes should be enough. I'm flexible on timing.\n\nBest,\n{sender_name}"
    return {
        "id": "easy_001",
        "task": "easy",
        "emails": [
            {
                "sender": sender_email,
                "subject": f"Meeting request: {topic}",
                "body": email_body,
                # NOTE(review): wall-clock timestamp, so a seeded scenario is
                # not byte-for-byte reproducible across runs.
                "timestamp": datetime.now().isoformat(),
                "priority": "normal",
            }
        ],
        "calendar": {
            "existing_meetings": existing_meetings,
            "working_hours": {
                "monday": "9-17",
                "tuesday": "9-17",
                "wednesday": "9-17",
                "thursday": "9-17",
                "friday": "9-16",
            },
            "executive_name": "Alex Chen",
        },
        "contacts": {
            sender_email: {
                "name": sender_name,
                "email": sender_email,
                "timezone": "America/Los_Angeles",
                "title": "Senior Manager",
            }
        },
        "expected_behavior": "Book meeting in open slot",
        "has_conflict": False,
    }
def _generate_medium_scenario(rng: random.Random) -> dict:
    """Generate a scenario whose requested slot collides with an existing meeting.

    Args:
        rng: Source of randomness for the sender and topic (sender is drawn
            first, then topic).

    Returns:
        Scenario dict where the sender offers "Monday 2-4pm or Tuesday
        morning", Monday 2-4 PM is taken by a board meeting, Tuesday 9-10 AM
        is taken by a 1-on-1, and ``has_conflict`` is True.
    """
    sender_name, sender_email = rng.choice(SENDER_NAMES)
    topic = rng.choice(MEETING_TOPICS)
    # 2026-04-27 is a Monday. The previous anchor (2026-04-28) was a Tuesday,
    # so the "Monday 2-4pm" conflict the email refers to was booked on the
    # wrong day.
    base_date = datetime(2026, 4, 27, 9, 0)  # Monday 9 AM
    # Conflict: Monday 2-4 PM is already booked
    conflict_start = base_date + timedelta(hours=5)
    conflict_end = base_date + timedelta(hours=7)
    existing_meetings = [
        {
            "id": "mtg_001",
            "participants": ["alex.chen@company.com", "board@company.com"],
            "start_time": conflict_start.isoformat(),
            "end_time": conflict_end.isoformat(),
            "subject": "Board meeting",
            "priority": "high",
        },
        {
            "id": "mtg_002",
            "participants": ["alex.chen@company.com", "manager@company.com"],
            "start_time": (base_date + timedelta(days=1, hours=0)).isoformat(),  # Tuesday 9 AM
            "end_time": (base_date + timedelta(days=1, hours=1)).isoformat(),  # Tuesday 10 AM
            "subject": "1-on-1 with manager",
            "priority": "normal",
        },
    ]
    email_body = f"Hi Alex,\n\nWe need to discuss {topic.lower()}. I'm available Monday 2-4pm or Tuesday morning. Can we make this work? It's fairly urgent.\n\nThanks,\n{sender_name}"
    return {
        "id": "medium_001",
        "task": "medium",
        "emails": [
            {
                "sender": sender_email,
                "subject": f"Urgent: {topic}",
                "body": email_body,
                # NOTE(review): wall-clock timestamp, so a seeded scenario is
                # not byte-for-byte reproducible across runs.
                "timestamp": datetime.now().isoformat(),
                "priority": "high",
            }
        ],
        "calendar": {
            "existing_meetings": existing_meetings,
            "working_hours": {
                "monday": "9-17",
                "tuesday": "9-17",
                "wednesday": "9-17",
                "thursday": "9-17",
                "friday": "9-16",
            },
            "executive_name": "Alex Chen",
        },
        "contacts": {
            sender_email: {
                "name": sender_name,
                "email": sender_email,
                "timezone": "America/Los_Angeles",
                "title": "Director",
            }
        },
        "expected_behavior": "Identify conflict, propose Tuesday 10-11 AM as alternative",
        "has_conflict": True,
    }
def _generate_hard_scenario(rng: random.Random) -> dict:
    """Generate a multi-party scenario with three competing meeting requests.

    Args:
        rng: Source of randomness; three distinct senders are sampled first,
            then three distinct topics.

    Returns:
        Scenario dict with three emails (a fixed Monday 2:30-3:30 PM request,
        an urgent Monday 2-3 PM request, and a flexible request), a calendar
        with Monday 2-3 PM and Wednesday 11 AM-12 PM already taken, contact
        cards for all three senders, and ``has_conflict`` True.
    """
    senders = rng.sample(SENDER_NAMES, 3)
    topics = rng.sample(MEETING_TOPICS, 3)
    # 2026-04-27 is a Monday. The previous anchor (2026-04-28) was a Tuesday,
    # which put the "Monday" and "Wed" meetings below on Tuesday and Thursday.
    base_date = datetime(2026, 4, 27, 9, 0)  # Monday 9 AM
    # Existing calendar — Monday 2-3 PM blocked with team sync
    existing_meetings = [
        {
            "id": "mtg_001",
            "participants": ["alex.chen@company.com", "team@company.com"],
            "start_time": (base_date + timedelta(hours=5)).isoformat(),  # Monday 2 PM
            "end_time": (base_date + timedelta(hours=6)).isoformat(),  # Monday 3 PM
            "subject": "Team sync",
            "priority": "normal",
        },
        {
            "id": "mtg_002",
            "participants": ["alex.chen@company.com", "exec@company.com"],
            "start_time": (base_date + timedelta(days=2, hours=2)).isoformat(),  # Wed 11 AM
            "end_time": (base_date + timedelta(days=2, hours=3)).isoformat(),  # Wed 12 PM
            "subject": "Executive review",
            "priority": "high",
        },
    ]
    # Three competing email requests
    emails = [
        {
            "sender": senders[0][1],
            "subject": f"Meeting: {topics[0]}",
            "body": (
                f"Hi Alex,\n\nCan we meet Monday 2:30-3:30 PM to discuss {topics[0].lower()}? "
                f"I'd really appreciate your input.\n\nThanks,\n{senders[0][0]}"
            ),
            "timestamp": datetime.now().isoformat(),
            "priority": "normal",
        },
        {
            "sender": senders[1][1],
            "subject": f"URGENT: {topics[1]}",
            "body": (
                f"Alex,\n\nWe need to discuss {topics[1].lower()} ASAP. "
                f"Monday afternoon works for me — ideally 2-3 PM. "
                f"This is time-sensitive and high priority.\n\nBest,\n{senders[1][0]}"
            ),
            "timestamp": datetime.now().isoformat(),
            "priority": "high",
        },
        {
            "sender": senders[2][1],
            "subject": f"{topics[2]} discussion",
            "body": (
                f"Hi Alex,\n\nCan we sync on {topics[2].lower()} sometime this week? "
                f"I'm flexible — any 30-minute slot works for me.\n\nThanks,\n{senders[2][0]}"
            ),
            "timestamp": datetime.now().isoformat(),
            "priority": "normal",
        },
    ]
    # One contact card per sender, keyed by email address.
    contacts = {
        sender[1]: {
            "name": sender[0],
            "email": sender[1],
            "timezone": "America/Los_Angeles",
            "title": "Manager",
        }
        for sender in senders
    }
    return {
        "id": "hard_001",
        "task": "hard",
        "emails": emails,
        "calendar": {
            "existing_meetings": existing_meetings,
            "working_hours": {
                "monday": "9-17",
                "tuesday": "9-17",
                "wednesday": "9-17",
                "thursday": "9-17",
                "friday": "9-16",
            },
            "executive_name": "Alex Chen",
        },
        "contacts": contacts,
        "expected_behavior": (
            "Prioritize URGENT email (sender 2). Book that meeting. "
            "Propose alternatives to sender 1 (conflicts with urgent). "
            "Offer flexible times to sender 3."
        ),
        "has_conflict": True,
    }
+# ============================================================
+# REWARD FUNCTION 1: EMAIL QUALITY
+# ============================================================
def compute_email_quality(reply: str, scenario: dict) -> float:
    """
    Rate an email reply from 0.0 to 1.0 with rule checks plus an LLM judge.
    Point budget:
    - Politeness marker present: 0.15
    - Greeting 0.05 and closing 0.05
    - Length: 0.15 for 20+ words, 0.08 for 10-19 words
    - At most two question marks: 0.10
    - No negative phrasing: 0.10
    - LLM professionalism judgement, scaled by 0.40
    The scenario argument is accepted but not consulted here.
    """
    lowered = reply.lower()

    def mentions(*needles):
        # Plain substring test against the lower-cased reply.
        return any(needle in lowered for needle in needles)

    total = 0.0
    # Politeness
    if mentions("thank you", "thanks", "appreciate"):
        total += 0.15
    # Greeting, then closing
    if mentions("hi ", "hello", "dear"):
        total += 0.05
    if mentions("best", "regards", "sincerely", "thanks,"):
        total += 0.05
    # Level of detail, measured in whitespace-separated words
    n_words = len(reply.split())
    if n_words >= 20:
        total += 0.15
    elif n_words >= 10:
        total += 0.08
    # Too many questions reads as uncertainty
    if reply.count("?") <= 2:
        total += 0.10
    # Negative phrasing forfeits the tone points
    if not mentions("can't", "won't", "impossible", "sorry but no", "unfortunately not", "no way"):
        total += 0.10
    # LLM-as-judge component
    total += _llm_judge_professionalism(reply) * 0.40
    return min(1.0, total)
def _heuristic_professionalism(reply: str) -> float:
    """Offline stand-in for the LLM judge: 0.7 for 2+ sentences and 50+ chars, else 0.4."""
    sentences = [s.strip() for s in reply.split('.') if s.strip()]
    if len(sentences) >= 2 and len(reply) >= 50:
        return 0.7
    return 0.4


def _llm_judge_professionalism(reply: str) -> float:
    """
    LLM-as-judge for email professionalism.

    Uses an OpenAI-compatible chat endpoint (base URL from ``API_BASE_URL``,
    defaulting to OpenRouter; model from ``MODEL_NAME``) authenticated with
    ``HF_TOKEN`` or ``API_KEY``. Falls back to a length/sentence heuristic
    when no key is configured or the call fails.

    Returns:
        Score clamped to [0.0, 1.0]; 0.5 when the model's answer holds no
        usable number.
    """
    import math

    api_key = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
    if not api_key:
        return _heuristic_professionalism(reply)
    try:
        client = OpenAI(
            base_url=os.getenv("API_BASE_URL", "https://openrouter.ai/api/v1"),
            api_key=api_key,
        )
        # NOTE(review): the reply is agent-controlled text embedded verbatim in
        # the judge prompt, so it can try to steer the judge's answer.
        prompt = f"""Rate the professionalism of this email reply on a scale of 0.0 to 1.0.
Email reply:
\"\"\"{reply}\"\"\"
Criteria:
- Clear and concise
- Professional tone
- No typos or grammar errors
- Appropriate level of formality
- Addresses the request directly
Respond with ONLY a single decimal number between 0.0 and 1.0. No explanation, just the number."""
        response = client.chat.completions.create(
            model=os.getenv("MODEL_NAME", "nvidia/nemotron-3-super-120b-a12b:free"),
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=10,
        )
        score_text = (response.choices[0].message.content or "").strip()
        for token in score_text.split():
            # Tolerate answers such as "0.8." or "(0.8)" by trimming wrapping
            # punctuation; a leading "." is kept so ".8" still parses.
            cleaned = token.rstrip(".,;:!?\"')]").lstrip("\"'([")
            try:
                score = float(cleaned)
            except ValueError:
                continue
            # float() also accepts "nan"/"inf", which would clamp to 1.0.
            if math.isfinite(score):
                return max(0.0, min(1.0, score))
        return 0.5
    except Exception as e:
        # Best-effort scorer: any API or parsing failure degrades to the heuristic.
        print(f"LLM judge error: {e}")
        return _heuristic_professionalism(reply)
+# ============================================================
+# REWARD FUNCTION 2: SCHEDULING CORRECTNESS
+# ============================================================
# Lower-case day names indexed by datetime.weekday() (Monday == 0).
_WEEKDAY_NAMES = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")
# Hours used when the calendar carries no usable working-hours information.
_DEFAULT_WORKING_WINDOW = (9, 17)


def _to_naive_datetime(value) -> datetime:
    """Parse an ISO-8601 string (or accept a datetime) and drop any UTC offset.

    Calendar entries are naive wall-clock times, and Python refuses to compare
    naive with aware datetimes, so offsets are discarded rather than converted.
    """
    parsed = value if isinstance(value, datetime) else datetime.fromisoformat(value)
    return parsed.replace(tzinfo=None)


def _working_window(calendar: dict, weekday: int):
    """Return (open_hour, close_hour) for a weekday, or None for a non-working day."""
    hours_by_day = calendar.get("working_hours")
    if not hours_by_day:
        return _DEFAULT_WORKING_WINDOW
    spec = hours_by_day.get(_WEEKDAY_NAMES[weekday])
    if spec is None:
        return None
    try:
        opens, closes = (int(part) for part in str(spec).split("-"))
    except ValueError:
        # Unparseable entry: fall back to the default window.
        return _DEFAULT_WORKING_WINDOW
    return opens, closes


def _seconds_into_day(moment: datetime) -> float:
    """Seconds elapsed since midnight of ``moment``'s own date."""
    return moment.hour * 3600 + moment.minute * 60 + moment.second + moment.microsecond / 1_000_000


def check_scheduling_correctness(meeting_details: Optional[dict], scenario: dict) -> dict:
    """
    Verify a proposed meeting against the scenario calendar with hard checks.

    Args:
        meeting_details: Proposed meeting with ``start_time`` / ``end_time``
            (ISO-8601 strings or datetimes), or None when nothing was proposed.
        scenario: Scenario dict; ``scenario["calendar"]`` supplies
            ``existing_meetings`` and, optionally, per-day ``working_hours``
            as ``"H-H"`` strings.

    Returns:
        dict with ``checks`` (meeting_provided, no_double_booking,
        within_working_hours, appropriate_duration) and ``score``, the
        fraction of checks passed. Missing details score 0.0; unparseable
        times score 0.25.
    """
    if not meeting_details:
        return {
            "checks": {
                "meeting_provided": False,
                "no_double_booking": False,
                "within_working_hours": False,
                "appropriate_duration": False,
            },
            "score": 0.0,
        }
    calendar = scenario["calendar"]
    existing_meetings = calendar["existing_meetings"]
    results = {
        "meeting_provided": True,
        "no_double_booking": True,
        "within_working_hours": True,
        "appropriate_duration": True,
    }
    try:
        meeting_start = _to_naive_datetime(meeting_details["start_time"])
        meeting_end = _to_naive_datetime(meeting_details["end_time"])
    except (KeyError, ValueError, TypeError):
        return {
            "checks": {
                "meeting_provided": True,
                "no_double_booking": False,
                "within_working_hours": False,
                "appropriate_duration": False,
            },
            "score": 0.25,  # Some credit for trying
        }
    # Check 1: no overlap with any existing meeting (touching endpoints are fine).
    for existing in existing_meetings:
        try:
            existing_start = _to_naive_datetime(existing["start_time"])
            existing_end = _to_naive_datetime(existing["end_time"])
        except (KeyError, ValueError, TypeError):
            continue  # Malformed calendar entries are ignored.
        if meeting_start < existing_end and meeting_end > existing_start:
            results["no_double_booking"] = False
            break
    # Check 2: start and end fall on the same working day, inside that day's
    # hours as published in the calendar (9-17 when none are published).
    window = _working_window(calendar, meeting_start.weekday())
    if window is None or meeting_start.date() != meeting_end.date():
        results["within_working_hours"] = False
    else:
        opens, closes = window
        starts_too_early = _seconds_into_day(meeting_start) < opens * 3600
        ends_too_late = _seconds_into_day(meeting_end) > closes * 3600
        if starts_too_early or ends_too_late:
            results["within_working_hours"] = False
    # Check 3: duration between 15 minutes and 2 hours.
    duration_minutes = (meeting_end - meeting_start).total_seconds() / 60
    if not (15 <= duration_minutes <= 120):
        results["appropriate_duration"] = False
    score = sum(results.values()) / len(results)
    return {
        "checks": results,
        "score": score,
    }
+# ============================================================
+# REWARD FUNCTION 3: CONFLICT RESOLUTION
+# ============================================================
def compute_conflict_resolution(action: dict, scenario: dict) -> float:
    """
    Rate the agent's handling of a scheduling conflict, from 0.0 to 1.0.
    Without a conflict, only the calendar action matters (book = 1.0,
    propose_alternatives = 0.5, anything else = 0.3). With a conflict, points
    come from the action taken, the number of proposed alternatives, and
    whether the email acknowledges the clash.
    """
    chosen_action = action.get("calendar_action", "")
    reply_text = action.get("email_reply", "")
    details = action.get("meeting_details") or {}

    # Conflict-free scenarios: booking straight away is the right move.
    if not scenario.get("has_conflict", False):
        if chosen_action == "book":
            return 1.0
        if chosen_action == "propose_alternatives":
            return 0.5  # Partial credit
        return 0.3

    points = 0.0
    # Recognising the conflict earns the most; a plain booking gets a token amount.
    if chosen_action in ("propose_alternatives", "reschedule"):
        points += 0.4
    elif chosen_action == "book":
        points += 0.1
    # 0.2 per proposed alternative, capped at 0.4.
    options = details.get("proposed_alternatives", []) or []
    if options:
        points += min(0.4, len(options) * 0.2)
    # The reply should say why the requested slot does not work.
    acknowledgements = (
        "conflict", "already booked", "unavailable",
        "scheduled", "occupied", "another meeting",
    )
    lowered_reply = reply_text.lower()
    if any(phrase in lowered_reply for phrase in acknowledgements):
        points += 0.2
    return min(1.0, points)
+# ============================================================
+# ANTI-REWARD HACKING: PENALTIES
+# ============================================================
def apply_penalties(action: dict, scenario: dict) -> float:
    """
    Detect and penalize reward hacking behaviors.

    Args:
        action: Agent action as a dict. Reads "email_reply",
            "calendar_action" and "meeting_details". A missing or None
            reply is treated as an empty string (and so counts as too short).
        scenario: Accepted for a uniform scorer signature; not read here.

    Returns:
        Penalty amount (0.0 = no penalty, higher = worse), capped at 1.0.
        Individual penalties are additive:
          * 0.30  reply shorter than 30 characters after stripping
          * 0.40  "book" action without meeting details
          * 0.10  reply contains a generic templated phrase
          * 0.15  reply longer than 200 words
          * 0.20  reply over 20 words with under 40% unique words
    """
    # Use `or` rather than a .get() default: the key may be present with an
    # explicit None, which would otherwise crash on .strip() below.
    email_reply = action.get("email_reply") or ""
    calendar_action = action.get("calendar_action", "")
    meeting_details = action.get("meeting_details")

    # Normalise once; both the phrase scan and the word statistics use it.
    reply_lower = email_reply.lower()
    words = reply_lower.split()

    penalty = 0.0

    # Penalty 1: Email too short (lazy response)
    if len(email_reply.strip()) < 30:
        penalty += 0.3

    # Penalty 2: Claimed to book but no details provided
    if calendar_action == "book" and not meeting_details:
        penalty += 0.4

    # Penalty 3: Generic templated phrases
    generic_phrases = (
        "as per your request",
        "please find attached",
        "hope this helps",
        "let me know if you have any questions",
        "do not hesitate to contact",
    )
    if any(phrase in reply_lower for phrase in generic_phrases):
        penalty += 0.10

    # Penalty 4: Overly long email (rambling)
    if len(words) > 200:
        penalty += 0.15

    # Penalty 5: Repeating the same content multiple times
    if len(words) > 20:
        word_diversity = len(set(words)) / len(words)
        if word_diversity < 0.4:  # Less than 40% unique words = repetitive
            penalty += 0.20

    return min(1.0, penalty)
+# ============================================================
+# HELPER FUNCTIONS
+# ============================================================
def parse_time_slot(time_str: str) -> Optional[datetime]:
    """Convert an ISO-8601 string into a datetime, or None if it cannot be parsed."""
    parsed: Optional[datetime] = None
    try:
        parsed = datetime.fromisoformat(time_str)
    except (TypeError, ValueError):
        # Malformed text (ValueError) or a non-string argument (TypeError):
        # fall through and report "no value" instead of raising.
        pass
    return parsed
def format_time_slot(dt: datetime) -> str:
    """Render a datetime as display text such as 'Tuesday, April 28 at 02:00 PM'."""
    # Weekday name, month name, zero-padded day, then 12-hour clock with AM/PM.
    readable_pattern = "%A, %B %d at %I:%M %p"
    return dt.strftime(readable_pattern)

server/models.py ADDED Viewed

	@@ -0,0 +1,98 @@

+"""
+models.py — Typed Pydantic models for Executive Assistant Environment
+Defines Action, Observation, and State types used by the OpenEnv spec.
+"""
+from pydantic import BaseModel
+from typing import List, Optional, Dict
+# ============================================================
+# ACTION — what the agent sends
+# ============================================================
class TimeSlot(BaseModel):
    """Proposed meeting time.

    Element type of ``MeetingDetails.proposed_alternatives``. Both endpoints
    are declared as plain strings; this model does not parse them as
    datetimes.
    """
    start_time: str  # ISO format: "2026-04-28T14:00:00"
    end_time: str  # presumably the same ISO format as start_time -- TODO confirm
    note: Optional[str] = None  # e.g., "This works better for all attendees"
class MeetingDetails(BaseModel):
    """Complete meeting information.

    Carried by ``AssistantAction.meeting_details``. Start and end are declared
    as strings rather than datetime objects.
    """
    participants: List[str]  # NOTE(review): names vs. email addresses is not specified here
    start_time: str  # presumably ISO format like TimeSlot.start_time -- TODO confirm
    end_time: str
    subject: str
    location: Optional[str] = "Conference Room A"  # default applied when no location is given
    proposed_alternatives: Optional[List[TimeSlot]] = None  # alternative slots; None when not supplied
class AssistantAction(BaseModel):
    """Agent's response to email scenario.

    Bundles the drafted reply text, the chosen calendar action, and an
    optional description of the meeting that action refers to.
    """
    email_reply: str  # Draft response to sender
    # Declared as a free-form str: the values listed on the next line are
    # documented but not enforced by this model.
    calendar_action: str  # "book" | "propose_alternatives" | "reschedule" | "decline"
    meeting_details: Optional[MeetingDetails] = None  # may be omitted; defaults to None
+# ============================================================
+# OBSERVATION — what the agent sees
+# ============================================================
class Meeting(BaseModel):
    """Existing calendar meeting.

    Element type of ``CalendarState.existing_meetings``.
    """
    id: str  # identifier for this meeting; format is not specified here
    participants: List[str]
    start_time: str  # presumably ISO format, e.g. "2026-04-28T14:00:00" -- TODO confirm
    end_time: str
    subject: str
    # Free-form str; the listed values are documented but not enforced.
    priority: str = "normal"  # "low" | "normal" | "high"
class ContactInfo(BaseModel):
    """Contact metadata.

    Value type of the ``AssistantObservation.contacts`` mapping.
    """
    name: str
    email: str  # plain str; no address validation is declared on this field
    timezone: str = "America/Los_Angeles"  # default zone; looks like an IANA tz name -- TODO confirm
    title: Optional[str] = None  # optional; defaults to None
class EmailInbox(BaseModel):
    """Incoming email request.

    Element type of ``AssistantObservation.emails``; each instance describes
    a single message.
    """
    sender: str
    subject: str
    body: str
    timestamp: str  # kept as a string; format not specified here (presumably ISO -- TODO confirm)
    priority: str = "normal"  # free-form str; defaults to "normal"
class CalendarState(BaseModel):
    """Current calendar state.

    Holds the meetings already on the calendar together with the working
    hours and the name of the executive the calendar belongs to.
    """
    existing_meetings: List[Meeting]  # required; may be an empty list
    working_hours: Dict[str, str]  # {"monday": "9-17", ...}
    executive_name: str = "Alex Chen"  # default executive name
class AssistantObservation(BaseModel):
    """What the agent receives after reset() or step().

    Every field is optional and defaults to None, so an observation can be
    built with any subset of them.
    """
    task: Optional[str] = None
    description: Optional[str] = None
    emails: Optional[List[EmailInbox]] = None  # incoming messages, when provided
    calendar: Optional[CalendarState] = None  # calendar snapshot, when provided
    contacts: Optional[Dict[str, ContactInfo]] = None  # NOTE(review): key meaning (name vs. email) not specified here
    action_required: Optional[str] = None
    message: Optional[str] = None
+# ============================================================
+# STATE — current environment state
+# ============================================================
class AssistantState(BaseModel):
    """Current state of the environment.

    All fields have defaults, so ``AssistantState()`` describes a fresh
    environment with no task, zero counters and ``episode_done`` False.
    """
    current_task: Optional[str] = None  # None until a task is set
    emails_pending: int = 0
    episode_done: bool = False
    steps_taken: int = 0
    total_score: float = 0.0