Naman Gupta committed on
Commit
905ac2f
·
1 Parent(s): 87b0927

Fix turn counting, task-aware max steps in inference, explicit conversation reset, openenv.yaml metadata

Browse files
Files changed (5) hide show
  1. frontend/index.html +17 -16
  2. inference.py +3 -2
  3. llm/pipeline.py +8 -1
  4. openenv.yaml +2 -0
  5. server/app.py +2 -1
frontend/index.html CHANGED
@@ -936,9 +936,7 @@
936
  }
937
 
938
  /* ── Execute attack step ─────────────────────────────────── */
939
- btnStep.addEventListener('click', executeStep);
940
-
941
- async function executeStep() {
942
  const framingEl = document.getElementById('framing');
943
  const framing = framingEl.value.trim();
944
  if (!framing) {
@@ -957,11 +955,12 @@
957
  framing,
958
  };
959
 
960
- // Optimistically render attacker message
961
  appendAttackerMsg(framing, action.strategy_type, action.target_category, action.intensity.toFixed(2));
962
 
963
- setLoading(btnStep, true);
964
- btnStep.disabled = true;
 
 
965
 
966
  try {
967
  const data = await api('POST', '/step', action);
@@ -971,29 +970,31 @@
971
  appendDefenderMsg(obs.defender_response, obs.attack_success_estimate, obs.defense_score);
972
  updateMetrics(obs, rw);
973
  updateHeader(obs, null);
 
974
 
975
  if (obs.episode_done) {
976
  episodeDone = true;
977
  episodeActive = false;
978
  setStatus('done');
979
- btnStep.disabled = true;
980
  btnAutoAttack.disabled = true;
981
- btnGrade.disabled = false;
982
  appendSystemMsg('Episode complete. Grade your performance.');
983
  feedbackTxt.textContent = obs.feedback;
984
  }
985
  } catch (e) {
986
  toast('Error: ' + e.message, true);
987
- btnStep.disabled = false;
988
  } finally {
989
- setLoading(btnStep, false);
990
- if (episodeDone) {
991
- btnStep.disabled = true;
992
- btnAutoAttack.disabled = true;
993
  }
994
  }
995
  }
996
 
 
 
997
  /* ── Execute auto attack ─────────────────────────────────── */
998
  btnAutoAttack.addEventListener('click', executeAutoAttack);
999
 
@@ -1002,19 +1003,19 @@
1002
  const category = document.getElementById('category').value;
1003
 
1004
  setLoading(btnAutoAttack, true);
1005
- btnStep.disabled = true;
1006
  btnAutoAttack.disabled = true;
1007
 
1008
  try {
1009
  const data = await api('POST', '/auto-attack', { strategy_type: strategy, target_category: category });
1010
  document.getElementById('framing').value = data.framing;
1011
- await executeStep();
1012
  } catch (e) {
1013
  toast('Error generating attack: ' + e.message, true);
1014
  } finally {
1015
  setLoading(btnAutoAttack, false);
1016
  if (!episodeDone) {
1017
- btnStep.disabled = false;
1018
  btnAutoAttack.disabled = false;
1019
  }
1020
  }
 
936
  }
937
 
938
  /* ── Execute attack step ─────────────────────────────────── */
939
+ async function executeStep(calledFromAuto = false) {
 
 
940
  const framingEl = document.getElementById('framing');
941
  const framing = framingEl.value.trim();
942
  if (!framing) {
 
955
  framing,
956
  };
957
 
 
958
  appendAttackerMsg(framing, action.strategy_type, action.target_category, action.intensity.toFixed(2));
959
 
960
+ if (!calledFromAuto) {
961
+ setLoading(btnStep, true);
962
+ btnStep.disabled = true;
963
+ }
964
 
965
  try {
966
  const data = await api('POST', '/step', action);
 
970
  appendDefenderMsg(obs.defender_response, obs.attack_success_estimate, obs.defense_score);
971
  updateMetrics(obs, rw);
972
  updateHeader(obs, null);
973
+ document.getElementById('framing').value = '';
974
 
975
  if (obs.episode_done) {
976
  episodeDone = true;
977
  episodeActive = false;
978
  setStatus('done');
979
+ btnStep.disabled = true;
980
  btnAutoAttack.disabled = true;
981
+ btnGrade.disabled = false;
982
  appendSystemMsg('Episode complete. Grade your performance.');
983
  feedbackTxt.textContent = obs.feedback;
984
  }
985
  } catch (e) {
986
  toast('Error: ' + e.message, true);
987
+ if (!calledFromAuto) btnStep.disabled = false;
988
  } finally {
989
+ if (!calledFromAuto) {
990
+ setLoading(btnStep, false);
991
+ if (episodeDone) btnStep.disabled = true;
 
992
  }
993
  }
994
  }
995
 
996
+ btnStep.addEventListener('click', () => executeStep(false));
997
+
998
  /* ── Execute auto attack ─────────────────────────────────── */
999
  btnAutoAttack.addEventListener('click', executeAutoAttack);
1000
 
 
1003
  const category = document.getElementById('category').value;
1004
 
1005
  setLoading(btnAutoAttack, true);
1006
+ btnStep.disabled = true;
1007
  btnAutoAttack.disabled = true;
1008
 
1009
  try {
1010
  const data = await api('POST', '/auto-attack', { strategy_type: strategy, target_category: category });
1011
  document.getElementById('framing').value = data.framing;
1012
+ await executeStep(true);
1013
  } catch (e) {
1014
  toast('Error generating attack: ' + e.message, true);
1015
  } finally {
1016
  setLoading(btnAutoAttack, false);
1017
  if (!episodeDone) {
1018
+ btnStep.disabled = false;
1019
  btnAutoAttack.disabled = false;
1020
  }
1021
  }
inference.py CHANGED
@@ -30,7 +30,8 @@ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("GROQ_API_KEY", "")
30
  SERVER_URL = os.getenv("SERVER_URL", "https://rayugacodes-breach-os.hf.space")
31
 
32
  BENCHMARK = "breach-os"
33
- MAX_STEPS = 10
 
34
 
35
  # ── LLM client ────────────────────────────────────────────────────────────────
36
  client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
@@ -94,7 +95,7 @@ async def run_task(task: str) -> None:
94
  defender_resp = obs["defender_response"]
95
  prev_success = 0.0
96
 
97
- while step < MAX_STEPS:
98
  step += 1
99
  action = generate_attack(defender_resp, step, prev_success)
100
  action_str = f"strategy={action['strategy_type']} category={action['target_category']} intensity={action['intensity']}"
 
30
  SERVER_URL = os.getenv("SERVER_URL", "https://rayugacodes-breach-os.hf.space")
31
 
32
  BENCHMARK = "breach-os"
33
+
34
+ TASK_MAX_STEPS = {"easy": 5, "medium": 8, "hard": 10}
35
 
36
  # ── LLM client ────────────────────────────────────────────────────────────────
37
  client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
 
95
  defender_resp = obs["defender_response"]
96
  prev_success = 0.0
97
 
98
+ while step < TASK_MAX_STEPS[task]:
99
  step += 1
100
  action = generate_attack(defender_resp, step, prev_success)
101
  action_str = f"strategy={action['strategy_type']} category={action['target_category']} intensity={action['intensity']}"
llm/pipeline.py CHANGED
@@ -162,9 +162,12 @@ async def run_llm_pipeline(action: Any, conversation_history: list[dict]) -> dic
162
  safety_flagged — True if dangerous keywords were detected
163
  llm_grade — summary dict for logging/grading
164
  """
165
- # An empty conversation_history signals a fresh episode — reset memory
166
  if not conversation_history:
167
  conversation.reset()
 
 
 
168
 
169
  attack_text = getattr(action, "framing", str(action))
170
 
@@ -191,6 +194,10 @@ async def run_llm_pipeline(action: Any, conversation_history: list[dict]) -> dic
191
  }
192
 
193
 
 
 
 
 
194
  # Keep old aliases so imports in other files don't break
195
  _conv = conversation
196
  _check_safety_flag = _is_dangerous
 
162
  safety_flagged — True if dangerous keywords were detected
163
  llm_grade — summary dict for logging/grading
164
  """
165
+ # Reset conversation on the first turn of each episode
166
  if not conversation_history:
167
  conversation.reset()
168
+ elif conversation.turn_count == 0:
169
+ # Handles case where reset() wasn't triggered (e.g. multi-task runs)
170
+ conversation.reset()
171
 
172
  attack_text = getattr(action, "framing", str(action))
173
 
 
194
  }
195
 
196
 
197
+ def reset_conversation() -> None:
198
+ """Explicitly reset the conversation — call this on every /reset."""
199
+ conversation.reset()
200
+
201
  # Keep old aliases so imports in other files don't break
202
  _conv = conversation
203
  _check_safety_flag = _is_dangerous
openenv.yaml CHANGED
@@ -1,6 +1,8 @@
1
  name: BreachOS
2
  version: "0.1.0"
3
  description: "AI Red-Teaming Environment for Safety Research"
 
 
4
 
5
  tasks:
6
  easy:
 
1
  name: BreachOS
2
  version: "0.1.0"
3
  description: "AI Red-Teaming Environment for Safety Research"
4
+ author: "X-MEN"
5
+ space_url: "https://huggingface.co/spaces/Rayugacodes/Breach-OS"
6
 
7
  tasks:
8
  easy:
server/app.py CHANGED
@@ -9,7 +9,7 @@ from server.environment import RedTeamEnvironment
9
  from server.config import get_settings
10
 
11
  from rewards.compute_rewards import RewardComputer
12
- from llm.pipeline import run_llm_pipeline
13
  from llm.automated_attacker import generate_automated_attack
14
 
15
  env: RedTeamEnvironment = None
@@ -54,6 +54,7 @@ async def health_check():
54
  @app.post("/reset", response_model=ResetResponse)
55
  async def reset_episode():
56
  try:
 
57
  observation = await env.reset()
58
  return ResetResponse(observation=observation, episode_id=observation.episode_id)
59
  except Exception as e:
 
9
  from server.config import get_settings
10
 
11
  from rewards.compute_rewards import RewardComputer
12
+ from llm.pipeline import run_llm_pipeline, reset_conversation
13
  from llm.automated_attacker import generate_automated_attack
14
 
15
  env: RedTeamEnvironment = None
 
54
  @app.post("/reset", response_model=ResetResponse)
55
  async def reset_episode():
56
  try:
57
+ reset_conversation()
58
  observation = await env.reset()
59
  return ResetResponse(observation=observation, episode_id=observation.episode_id)
60
  except Exception as e: