Spaces:
Sleeping
Sleeping
Naman Gupta committed on
Commit ·
905ac2f
1
Parent(s): 87b0927
Fix turn counting, task-aware max steps in inference, explicit conversation reset, openenv.yaml metadata
Browse files- frontend/index.html +17 -16
- inference.py +3 -2
- llm/pipeline.py +8 -1
- openenv.yaml +2 -0
- server/app.py +2 -1
frontend/index.html
CHANGED
|
@@ -936,9 +936,7 @@
|
|
| 936 |
}
|
| 937 |
|
| 938 |
/* ── Execute attack step ─────────────────────────────────── */
|
| 939 |
-
|
| 940 |
-
|
| 941 |
-
async function executeStep() {
|
| 942 |
const framingEl = document.getElementById('framing');
|
| 943 |
const framing = framingEl.value.trim();
|
| 944 |
if (!framing) {
|
|
@@ -957,11 +955,12 @@
|
|
| 957 |
framing,
|
| 958 |
};
|
| 959 |
|
| 960 |
-
// Optimistically render attacker message
|
| 961 |
appendAttackerMsg(framing, action.strategy_type, action.target_category, action.intensity.toFixed(2));
|
| 962 |
|
| 963 |
-
|
| 964 |
-
|
|
|
|
|
|
|
| 965 |
|
| 966 |
try {
|
| 967 |
const data = await api('POST', '/step', action);
|
|
@@ -971,29 +970,31 @@
|
|
| 971 |
appendDefenderMsg(obs.defender_response, obs.attack_success_estimate, obs.defense_score);
|
| 972 |
updateMetrics(obs, rw);
|
| 973 |
updateHeader(obs, null);
|
|
|
|
| 974 |
|
| 975 |
if (obs.episode_done) {
|
| 976 |
episodeDone = true;
|
| 977 |
episodeActive = false;
|
| 978 |
setStatus('done');
|
| 979 |
-
btnStep.disabled
|
| 980 |
btnAutoAttack.disabled = true;
|
| 981 |
-
btnGrade.disabled
|
| 982 |
appendSystemMsg('Episode complete. Grade your performance.');
|
| 983 |
feedbackTxt.textContent = obs.feedback;
|
| 984 |
}
|
| 985 |
} catch (e) {
|
| 986 |
toast('Error: ' + e.message, true);
|
| 987 |
-
btnStep.disabled = false;
|
| 988 |
} finally {
|
| 989 |
-
|
| 990 |
-
|
| 991 |
-
btnStep.disabled = true;
|
| 992 |
-
btnAutoAttack.disabled = true;
|
| 993 |
}
|
| 994 |
}
|
| 995 |
}
|
| 996 |
|
|
|
|
|
|
|
| 997 |
/* ── Execute auto attack ─────────────────────────────────── */
|
| 998 |
btnAutoAttack.addEventListener('click', executeAutoAttack);
|
| 999 |
|
|
@@ -1002,19 +1003,19 @@
|
|
| 1002 |
const category = document.getElementById('category').value;
|
| 1003 |
|
| 1004 |
setLoading(btnAutoAttack, true);
|
| 1005 |
-
btnStep.disabled
|
| 1006 |
btnAutoAttack.disabled = true;
|
| 1007 |
|
| 1008 |
try {
|
| 1009 |
const data = await api('POST', '/auto-attack', { strategy_type: strategy, target_category: category });
|
| 1010 |
document.getElementById('framing').value = data.framing;
|
| 1011 |
-
await executeStep();
|
| 1012 |
} catch (e) {
|
| 1013 |
toast('Error generating attack: ' + e.message, true);
|
| 1014 |
} finally {
|
| 1015 |
setLoading(btnAutoAttack, false);
|
| 1016 |
if (!episodeDone) {
|
| 1017 |
-
btnStep.disabled
|
| 1018 |
btnAutoAttack.disabled = false;
|
| 1019 |
}
|
| 1020 |
}
|
|
|
|
| 936 |
}
|
| 937 |
|
| 938 |
/* ── Execute attack step ─────────────────────────────────── */
|
| 939 |
+
async function executeStep(calledFromAuto = false) {
|
|
|
|
|
|
|
| 940 |
const framingEl = document.getElementById('framing');
|
| 941 |
const framing = framingEl.value.trim();
|
| 942 |
if (!framing) {
|
|
|
|
| 955 |
framing,
|
| 956 |
};
|
| 957 |
|
|
|
|
| 958 |
appendAttackerMsg(framing, action.strategy_type, action.target_category, action.intensity.toFixed(2));
|
| 959 |
|
| 960 |
+
if (!calledFromAuto) {
|
| 961 |
+
setLoading(btnStep, true);
|
| 962 |
+
btnStep.disabled = true;
|
| 963 |
+
}
|
| 964 |
|
| 965 |
try {
|
| 966 |
const data = await api('POST', '/step', action);
|
|
|
|
| 970 |
appendDefenderMsg(obs.defender_response, obs.attack_success_estimate, obs.defense_score);
|
| 971 |
updateMetrics(obs, rw);
|
| 972 |
updateHeader(obs, null);
|
| 973 |
+
document.getElementById('framing').value = '';
|
| 974 |
|
| 975 |
if (obs.episode_done) {
|
| 976 |
episodeDone = true;
|
| 977 |
episodeActive = false;
|
| 978 |
setStatus('done');
|
| 979 |
+
btnStep.disabled = true;
|
| 980 |
btnAutoAttack.disabled = true;
|
| 981 |
+
btnGrade.disabled = false;
|
| 982 |
appendSystemMsg('Episode complete. Grade your performance.');
|
| 983 |
feedbackTxt.textContent = obs.feedback;
|
| 984 |
}
|
| 985 |
} catch (e) {
|
| 986 |
toast('Error: ' + e.message, true);
|
| 987 |
+
if (!calledFromAuto) btnStep.disabled = false;
|
| 988 |
} finally {
|
| 989 |
+
if (!calledFromAuto) {
|
| 990 |
+
setLoading(btnStep, false);
|
| 991 |
+
if (episodeDone) btnStep.disabled = true;
|
|
|
|
| 992 |
}
|
| 993 |
}
|
| 994 |
}
|
| 995 |
|
| 996 |
+
btnStep.addEventListener('click', () => executeStep(false));
|
| 997 |
+
|
| 998 |
/* ── Execute auto attack ─────────────────────────────────── */
|
| 999 |
btnAutoAttack.addEventListener('click', executeAutoAttack);
|
| 1000 |
|
|
|
|
| 1003 |
const category = document.getElementById('category').value;
|
| 1004 |
|
| 1005 |
setLoading(btnAutoAttack, true);
|
| 1006 |
+
btnStep.disabled = true;
|
| 1007 |
btnAutoAttack.disabled = true;
|
| 1008 |
|
| 1009 |
try {
|
| 1010 |
const data = await api('POST', '/auto-attack', { strategy_type: strategy, target_category: category });
|
| 1011 |
document.getElementById('framing').value = data.framing;
|
| 1012 |
+
await executeStep(true);
|
| 1013 |
} catch (e) {
|
| 1014 |
toast('Error generating attack: ' + e.message, true);
|
| 1015 |
} finally {
|
| 1016 |
setLoading(btnAutoAttack, false);
|
| 1017 |
if (!episodeDone) {
|
| 1018 |
+
btnStep.disabled = false;
|
| 1019 |
btnAutoAttack.disabled = false;
|
| 1020 |
}
|
| 1021 |
}
|
inference.py
CHANGED
|
@@ -30,7 +30,8 @@ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("GROQ_API_KEY", "")
|
|
| 30 |
SERVER_URL = os.getenv("SERVER_URL", "https://rayugacodes-breach-os.hf.space")
|
| 31 |
|
| 32 |
BENCHMARK = "breach-os"
|
| 33 |
-
|
|
|
|
| 34 |
|
| 35 |
# ── LLM client ────────────────────────────────────────────────────────────────
|
| 36 |
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
|
|
@@ -94,7 +95,7 @@ async def run_task(task: str) -> None:
|
|
| 94 |
defender_resp = obs["defender_response"]
|
| 95 |
prev_success = 0.0
|
| 96 |
|
| 97 |
-
while step <
|
| 98 |
step += 1
|
| 99 |
action = generate_attack(defender_resp, step, prev_success)
|
| 100 |
action_str = f"strategy={action['strategy_type']} category={action['target_category']} intensity={action['intensity']}"
|
|
|
|
| 30 |
SERVER_URL = os.getenv("SERVER_URL", "https://rayugacodes-breach-os.hf.space")
|
| 31 |
|
| 32 |
BENCHMARK = "breach-os"
|
| 33 |
+
|
| 34 |
+
TASK_MAX_STEPS = {"easy": 5, "medium": 8, "hard": 10}
|
| 35 |
|
| 36 |
# ── LLM client ────────────────────────────────────────────────────────────────
|
| 37 |
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
|
|
|
|
| 95 |
defender_resp = obs["defender_response"]
|
| 96 |
prev_success = 0.0
|
| 97 |
|
| 98 |
+
while step < TASK_MAX_STEPS[task]:
|
| 99 |
step += 1
|
| 100 |
action = generate_attack(defender_resp, step, prev_success)
|
| 101 |
action_str = f"strategy={action['strategy_type']} category={action['target_category']} intensity={action['intensity']}"
|
llm/pipeline.py
CHANGED
|
@@ -162,9 +162,12 @@ async def run_llm_pipeline(action: Any, conversation_history: list[dict]) -> dic
|
|
| 162 |
safety_flagged — True if dangerous keywords were detected
|
| 163 |
llm_grade — summary dict for logging/grading
|
| 164 |
"""
|
| 165 |
-
#
|
| 166 |
if not conversation_history:
|
| 167 |
conversation.reset()
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
attack_text = getattr(action, "framing", str(action))
|
| 170 |
|
|
@@ -191,6 +194,10 @@ async def run_llm_pipeline(action: Any, conversation_history: list[dict]) -> dic
|
|
| 191 |
}
|
| 192 |
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
# Keep old aliases so imports in other files don't break
|
| 195 |
_conv = conversation
|
| 196 |
_check_safety_flag = _is_dangerous
|
|
|
|
| 162 |
safety_flagged — True if dangerous keywords were detected
|
| 163 |
llm_grade — summary dict for logging/grading
|
| 164 |
"""
|
| 165 |
+
# Reset conversation on the first turn of each episode
|
| 166 |
if not conversation_history:
|
| 167 |
conversation.reset()
|
| 168 |
+
elif conversation.turn_count == 0:
|
| 169 |
+
# Handles case where reset() wasn't triggered (e.g. multi-task runs)
|
| 170 |
+
conversation.reset()
|
| 171 |
|
| 172 |
attack_text = getattr(action, "framing", str(action))
|
| 173 |
|
|
|
|
| 194 |
}
|
| 195 |
|
| 196 |
|
| 197 |
+
def reset_conversation() -> None:
|
| 198 |
+
"""Explicitly reset the conversation — call this on every /reset."""
|
| 199 |
+
conversation.reset()
|
| 200 |
+
|
| 201 |
# Keep old aliases so imports in other files don't break
|
| 202 |
_conv = conversation
|
| 203 |
_check_safety_flag = _is_dangerous
|
openenv.yaml
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
name: BreachOS
|
| 2 |
version: "0.1.0"
|
| 3 |
description: "AI Red-Teaming Environment for Safety Research"
|
|
|
|
|
|
|
| 4 |
|
| 5 |
tasks:
|
| 6 |
easy:
|
|
|
|
| 1 |
name: BreachOS
|
| 2 |
version: "0.1.0"
|
| 3 |
description: "AI Red-Teaming Environment for Safety Research"
|
| 4 |
+
author: "X-MEN"
|
| 5 |
+
space_url: "https://huggingface.co/spaces/Rayugacodes/Breach-OS"
|
| 6 |
|
| 7 |
tasks:
|
| 8 |
easy:
|
server/app.py
CHANGED
|
@@ -9,7 +9,7 @@ from server.environment import RedTeamEnvironment
|
|
| 9 |
from server.config import get_settings
|
| 10 |
|
| 11 |
from rewards.compute_rewards import RewardComputer
|
| 12 |
-
from llm.pipeline import run_llm_pipeline
|
| 13 |
from llm.automated_attacker import generate_automated_attack
|
| 14 |
|
| 15 |
env: RedTeamEnvironment = None
|
|
@@ -54,6 +54,7 @@ async def health_check():
|
|
| 54 |
@app.post("/reset", response_model=ResetResponse)
|
| 55 |
async def reset_episode():
|
| 56 |
try:
|
|
|
|
| 57 |
observation = await env.reset()
|
| 58 |
return ResetResponse(observation=observation, episode_id=observation.episode_id)
|
| 59 |
except Exception as e:
|
|
|
|
| 9 |
from server.config import get_settings
|
| 10 |
|
| 11 |
from rewards.compute_rewards import RewardComputer
|
| 12 |
+
from llm.pipeline import run_llm_pipeline, reset_conversation
|
| 13 |
from llm.automated_attacker import generate_automated_attack
|
| 14 |
|
| 15 |
env: RedTeamEnvironment = None
|
|
|
|
| 54 |
@app.post("/reset", response_model=ResetResponse)
|
| 55 |
async def reset_episode():
|
| 56 |
try:
|
| 57 |
+
reset_conversation()
|
| 58 |
observation = await env.reset()
|
| 59 |
return ResetResponse(observation=observation, episode_id=observation.episode_id)
|
| 60 |
except Exception as e:
|