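# Breach-OS / models.py
# Hosting-page residue captured with the file (not part of the module):
#   avatar alt text: "pi9605's picture"
#   commit message: "updated inference.py" (4df57fe)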
"""
Shared Pydantic models and Enums for the BreachOS environment.
These define the API contract between the server, environment, and clients.
"""
import re
from enum import Enum
from typing import Optional

from pydantic import BaseModel, Field, field_validator
class StrategyType(str, Enum):
    """Closed set of strategy labels accepted in ``strategy_type`` fields.

    Members also subclass ``str``, so each one compares equal to its raw
    string value (``StrategyType.ROLEPLAY == "roleplay"``) and can be looked
    up from that value (``StrategyType("roleplay")``).
    """

    ROLEPLAY = "roleplay"
    HYPOTHETICAL = "hypothetical"
    INJECTION = "injection"
    PERSONA_SWITCH = "persona_switch"
    AUTHORITY_CLAIM = "authority_claim"
    ENCODING = "encoding"
    MULTI_TURN = "multi_turn"
class TargetCategory(str, Enum):
    """Closed set of category labels accepted in ``target_category`` fields.

    Members also subclass ``str``, so each one compares equal to its raw
    string value and can be looked up from it
    (``TargetCategory("privacy") is TargetCategory.PRIVACY``).
    """

    PRIVACY = "privacy"
    MISINFORMATION = "misinformation"
    HARMFUL_INSTRUCTIONS = "harmful_instructions"
    MANIPULATION = "manipulation"
    ILLEGAL_ACTIVITY = "illegal_activity"
# Markup fragments removed from ``AttackAction.framing``. Compiled once at
# import time and matched case-insensitively, so variants such as
# ``<SCRIPT>`` or ``OnError=`` are filtered as well as the lowercase forms.
_DANGEROUS_FRAMING_RE = re.compile(
    "|".join(re.escape(token) for token in ("<script>", "javascript:", "onerror=")),
    re.IGNORECASE,
)


class AttackAction(BaseModel):
    """One attack step submitted to the environment.

    Fields:
        strategy_type: Which ``StrategyType`` the attack uses.
        target_category: Which ``TargetCategory`` the attack aims at.
        intensity: Float constrained to the closed range [0.0, 1.0].
        framing: Free text of at most 500 characters; sanitised by
            ``sanitize_framing`` after validation.
    """

    strategy_type: StrategyType
    target_category: TargetCategory
    intensity: float = Field(ge=0.0, le=1.0)
    framing: str = Field(max_length=500)

    @field_validator("framing")
    @classmethod
    def sanitize_framing(cls, v: str) -> str:
        """Replace known markup fragments in *v* and trim surrounding whitespace.

        Every occurrence of ``<script>``, ``javascript:`` or ``onerror=``
        (in any letter case) is replaced with the literal ``[FILTERED]``.

        Args:
            v: The already-validated ``framing`` string.

        Returns:
            The filtered string with leading/trailing whitespace stripped.
        """
        # NOTE(review): as a default-mode field validator this presumably runs
        # after the ``max_length=500`` check, and "[FILTERED]" is longer than
        # two of the three patterns, so the stored value can exceed 500
        # characters -- confirm whether downstream code relies on the limit.
        return _DANGEROUS_FRAMING_RE.sub("[FILTERED]", v).strip()
class AutoAttackRequest(BaseModel):
    """Request carrying only a strategy and a target category.

    Unlike ``AttackAction`` it has no ``intensity`` or ``framing`` field.
    NOTE(review): presumably the server fills those in for "auto" attacks --
    confirm against the request handler.
    """

    strategy_type: StrategyType
    target_category: TargetCategory
class RedTeamObservation(BaseModel):
    """Observation payload embedded in ``StepResult`` and ``ResetResponse``.

    The three ``*_score`` / ``*_estimate`` fields are validated to lie in the
    closed range [0.0, 1.0]; ``turn`` must be non-negative.
    """

    # Text produced by the defender for this turn.
    defender_response: str
    # Scores below are each constrained to [0.0, 1.0].
    defense_score: float = Field(ge=0.0, le=1.0)
    attack_success_estimate: float = Field(ge=0.0, le=1.0)
    novelty_score: float = Field(ge=0.0, le=1.0)
    # Turn counter; validation rejects negative values.
    turn: int = Field(ge=0)
    episode_done: bool
    feedback: str
    episode_id: str
class EpisodeState(BaseModel):
    """Snapshot of an episode's identifier, counters and active flag."""

    episode_id: str
    # NOTE(review): the counters below are plain ints with no bounds, whereas
    # ``RedTeamObservation.turn`` is declared with ``ge=0`` -- confirm whether
    # negative values are ever meaningful here before tightening.
    turn: int
    max_turns: int
    attacks_so_far: int
    is_active: bool
class StepResult(BaseModel):
    """Pairs a ``RedTeamObservation`` with a float ``reward``.

    NOTE(review): ``reward`` is unconstrained here; its range and sign
    convention are defined by whatever produces it -- confirm with the
    environment code.
    """

    observation: RedTeamObservation
    reward: float
class ResetResponse(BaseModel):
    """Pairs a ``RedTeamObservation`` with an ``episode_id``.

    NOTE(review): ``observation`` carries its own ``episode_id`` field;
    presumably both hold the same value -- confirm with the server code.
    """

    observation: RedTeamObservation
    episode_id: str