# Spaces: Running
import re
from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Optional
@dataclass
class JailbreakRule:
    """A single regex-based jailbreak-detection rule.

    The ``@dataclass`` decorator generates the keyword-argument constructor
    that the module-level rule table relies on.

    Attributes:
        name: Short identifier for the rule.
        pattern: Compiled regular expression the rule searches for.
        description: Human-readable explanation of what the rule flags.
        weight: Numeric weight carried along with each match; defaults to
            1.0. How the weight is consumed is up to the caller.
    """

    name: str
    pattern: re.Pattern
    description: str
    weight: float = 1.0
# A small but realistic set of jailbreak-style patterns.
# Every pattern is compiled once at import time with re.I, so matching is
# case-insensitive for all rules.
_RAW_RULES: List["JailbreakRule"] = [
    JailbreakRule(
        name="ignore_safety",
        pattern=re.compile(r"\bignore (all )?(previous )?(rules|instructions|safety)\b", re.I),
        description="Tries to override or ignore prior safety instructions.",
        weight=1.2,
    ),
    JailbreakRule(
        name="dan_style",
        # NOTE: because of re.I, the \bDAN\b alternative also matches the
        # standalone word "dan"/"Dan" in any casing.
        pattern=re.compile(r"\bDAN\b|\bdo anything now\b", re.I),
        description="Classic 'DAN' jailbreak pattern.",
        weight=1.5,
    ),
    JailbreakRule(
        name="uncensored_mode",
        pattern=re.compile(r"\buncensored\b|\bno restrictions\b|\bno filter\b", re.I),
        description="Requests uncensored / unrestricted behavior.",
        weight=1.3,
    ),
    JailbreakRule(
        name="jailbreak_word",
        pattern=re.compile(r"\bjailbreak\b", re.I),
        description="Explicitly mentions jailbreak.",
        weight=1.1,
    ),
    JailbreakRule(
        name="pretend_roleplay",
        pattern=re.compile(r"\bpretend to be\b|\broleplay as\b", re.I),
        description="Asks the model to roleplay as an unsafe persona.",
        weight=1.0,
    ),
    JailbreakRule(
        name="override_system",
        pattern=re.compile(r"\byou are not an (AI|assistant)\b|\byou must always answer\b", re.I),
        description="Attempts to override system-level identity or policy.",
        weight=1.4,
    ),
]
def get_rules() -> List["JailbreakRule"]:
    """Return a new list containing the built-in rules.

    The list itself is a fresh copy, so callers may add, remove or reorder
    entries without affecting the module-level table. The rule objects
    inside it are shared, not copied.
    """
    return list(_RAW_RULES)
def match_rules(
    text: str,
    *,
    rules: Optional[Iterable["JailbreakRule"]] = None,
) -> List[Dict[str, Any]]:
    """Return every rule match found in ``text``, with spans for debugging.

    Args:
        text: The string to scan.
        rules: Rules to evaluate. Each one must expose ``name``,
            ``pattern`` (a compiled regex), ``description`` and ``weight``.
            When omitted, the module-level ``_RAW_RULES`` table is used.

    Returns:
        One dict per regex match, ordered first by rule and then by match
        position within ``text``. A rule that matches several times
        produces several entries. Each dict has the keys ``"rule"``,
        ``"description"``, ``"start"``, ``"end"``, ``"match_text"`` and
        ``"weight"``. The list is empty when nothing matches.
    """
    # Compare against None explicitly so that an empty rule collection means
    # "no rules" instead of silently falling back to the defaults.
    active_rules = _RAW_RULES if rules is None else rules

    hits: List[Dict[str, Any]] = []
    for rule in active_rules:
        for m in rule.pattern.finditer(text):
            hits.append(
                {
                    "rule": rule.name,
                    "description": rule.description,
                    "start": m.start(),
                    "end": m.end(),
                    "match_text": m.group(0),
                    "weight": rule.weight,
                }
            )
    return hits