"""
Tests for llm/ modules.
All Groq API calls are mocked — tests run fully offline.
"""
import pytest
from unittest.mock import MagicMock, patch
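# NOTE: the async pipeline tests below use @pytest.mark.asyncio, which
# assumes the pytest-asyncio plugin is installed (and, depending on its
# version, that asyncio_mode is configured in the project's pytest settings).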
# ------------------------------------------------------------------ #
# Helpers
# ------------------------------------------------------------------ #
def _make_groq_response(content: str) -> MagicMock:
"""Build a minimal mock that looks like an OpenAI chat completion."""
choice = MagicMock()
choice.message.content = content
resp = MagicMock()
resp.choices = [choice]
return resp
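# Usage sketch — the mock satisfies exactly the attribute chain the modules read:
#   resp = _make_groq_response("0.5")
#   resp.choices[0].message.content  # -> "0.5"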
# ------------------------------------------------------------------ #
# ConversationManager
# ------------------------------------------------------------------ #
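# The assertions below imply roughly this interface (a sketch inferred from
# the tests, not the actual implementation):
#
#   class ConversationManager:
#       turn: int                            # number of user messages so far
#       def add_user(self, text: str) -> None: ...
#       def add_assistant(self, text: str) -> None: ...
#       def get_messages(self, system_prompt: str) -> list[dict]  # system first
#       def to_transcript(self) -> str       # "ATTACKER: ...\nDEFENDER: ..."
#       def reset(self) -> None: ...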
class TestConversationManager:
def setup_method(self):
from llm.history_manager import ConversationManager
self.mgr = ConversationManager()
def test_starts_empty(self):
assert self.mgr.turn == 0
assert self.mgr.get_messages("sys") == [{"role": "system", "content": "sys"}]
def test_add_user_and_assistant(self):
self.mgr.add_user("attack")
self.mgr.add_assistant("refused")
assert self.mgr.turn == 1
msgs = self.mgr.get_messages("sys")
assert msgs[1] == {"role": "user", "content": "attack"}
assert msgs[2] == {"role": "assistant", "content": "refused"}
def test_reset_clears_history(self):
self.mgr.add_user("attack")
self.mgr.add_assistant("refused")
self.mgr.reset()
assert self.mgr.turn == 0
assert len(self.mgr.get_messages("sys")) == 1 # only system prompt
def test_turn_counts_only_user_messages(self):
self.mgr.add_user("a")
self.mgr.add_user("b")
self.mgr.add_assistant("r")
assert self.mgr.turn == 2
def test_to_transcript(self):
self.mgr.add_user("hello")
self.mgr.add_assistant("refused")
transcript = self.mgr.to_transcript()
assert "ATTACKER: hello" in transcript
assert "DEFENDER: refused" in transcript
def test_multi_turn_message_order(self):
for i in range(3):
self.mgr.add_user(f"attack {i}")
self.mgr.add_assistant(f"refuse {i}")
msgs = self.mgr.get_messages("sys")
        # system prompt + 3 user/assistant pairs (6 messages) = 7 total
assert len(msgs) == 7
assert msgs[0]["role"] == "system"
assert msgs[1]["role"] == "user"
assert msgs[2]["role"] == "assistant"
# ------------------------------------------------------------------ #
# Attack Classifier — heuristic fallback
# ------------------------------------------------------------------ #
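# _heuristic is the offline fallback scorer. As the assertions imply, it
# appears to count refusal phrases and map them to a 0..1 attack-success
# score (more refusals -> lower score), clamped to [0.0, 1.0].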
class TestAttackClassifierHeuristic:
def _heuristic(self, text):
from llm.attack_classifier import _heuristic
return _heuristic(text)
def test_clear_refusal_scores_low(self):
score = self._heuristic("I cannot help. I will not assist. This is harmful.")
assert score <= 0.3
def test_no_refusal_scores_high(self):
score = self._heuristic("Sure, here is how you do it step by step.")
assert score >= 0.7
def test_score_in_range(self):
for text in ["ok", "i cannot i won't i refuse i will not", ""]:
s = self._heuristic(text)
assert 0.0 <= s <= 1.0
# ------------------------------------------------------------------ #
# Defense Scorer — heuristic fallback
# ------------------------------------------------------------------ #
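# The defense-side heuristic uses the inverse scale: strong refusals score
# high and compliance scores low (again inferred from the assertions below).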
class TestDefenseScorerHeuristic:
def _heuristic(self, text):
from llm.defense_scorer import _heuristic
return _heuristic(text)
def test_strong_refusal_scores_high(self):
score = self._heuristic("I cannot do that. I won't assist. This is harmful.")
assert score >= 0.7
def test_compliance_scores_low(self):
score = self._heuristic("Sure, here you go.")
assert score <= 0.6
def test_score_in_range(self):
for text in ["ok", "i cannot i won't i refuse", ""]:
s = self._heuristic(text)
assert 0.0 <= s <= 1.0
# ------------------------------------------------------------------ #
# Attack Classifier — LLM path
# ------------------------------------------------------------------ #
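# These tests patch the module-level `groq` client object, which assumes
# each llm/ module binds its client to a name called `groq` at import time.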
class TestClassifyAttackSuccess:
def test_llm_returns_valid_score(self):
from llm.attack_classifier import classify_attack_success
with patch("llm.attack_classifier.groq") as mock_groq:
mock_groq.chat.completions.create.return_value = _make_groq_response("0.1")
score = classify_attack_success("attack", "I cannot help.")
assert score == 0.1
def test_llm_returns_1(self):
from llm.attack_classifier import classify_attack_success
with patch("llm.attack_classifier.groq") as mock_groq:
mock_groq.chat.completions.create.return_value = _make_groq_response("1.0")
score = classify_attack_success("attack", "Sure, here you go.")
assert score == 1.0
def test_falls_back_on_api_error(self):
from llm.attack_classifier import classify_attack_success
with patch("llm.attack_classifier.groq") as mock_groq:
mock_groq.chat.completions.create.side_effect = Exception("timeout")
score = classify_attack_success("attack", "I cannot help with that.")
assert 0.0 <= score <= 1.0
def test_falls_back_on_unparseable_output(self):
from llm.attack_classifier import classify_attack_success
with patch("llm.attack_classifier.groq") as mock_groq:
mock_groq.chat.completions.create.return_value = _make_groq_response("yes definitely")
score = classify_attack_success("attack", "I cannot help.")
assert 0.0 <= score <= 1.0
# ------------------------------------------------------------------ #
# Defense Scorer — LLM path
# ------------------------------------------------------------------ #
class TestScoreDefenseQuality:
def test_llm_returns_valid_score(self):
from llm.defense_scorer import score_defense_quality
with patch("llm.defense_scorer.groq") as mock_groq:
mock_groq.chat.completions.create.return_value = _make_groq_response("0.9")
score = score_defense_quality("attack", "I cannot help.")
assert score == 0.9
def test_falls_back_on_api_error(self):
from llm.defense_scorer import score_defense_quality
with patch("llm.defense_scorer.groq") as mock_groq:
mock_groq.chat.completions.create.side_effect = Exception("network error")
score = score_defense_quality("attack", "I cannot help with that.")
assert 0.0 <= score <= 1.0
def test_score_always_in_range(self):
from llm.defense_scorer import score_defense_quality
for raw in ["0.0", "0.5", "1.0"]:
with patch("llm.defense_scorer.groq") as mock_groq:
mock_groq.chat.completions.create.return_value = _make_groq_response(raw)
score = score_defense_quality("a", "b")
assert 0.0 <= score <= 1.0
# ------------------------------------------------------------------ #
# Defender
# ------------------------------------------------------------------ #
class TestCallDefender:
def test_returns_llm_response(self):
from llm.history_manager import ConversationManager
from llm.defender import call_defender
conv = ConversationManager()
conv.add_user("test attack")
with patch("llm.defender.groq") as mock_groq:
mock_groq.chat.completions.create.return_value = _make_groq_response(" I cannot help. ")
result = call_defender(conv)
assert result == "I cannot help."
def test_returns_fallback_on_error(self):
from llm.history_manager import ConversationManager
from llm.defender import call_defender, FALLBACK_RESPONSE
conv = ConversationManager()
conv.add_user("test attack")
with patch("llm.defender.groq") as mock_groq:
mock_groq.chat.completions.create.side_effect = Exception("timeout")
result = call_defender(conv)
assert result == FALLBACK_RESPONSE
# ------------------------------------------------------------------ #
# Safety flag
# ------------------------------------------------------------------ #
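# As exercised below, the safety flag matches dangerous keyword phrases in
# the defender's response only (not the attack text), case-insensitively.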
class TestCheckSafetyFlag:
def _flag(self, attack, response):
from llm.pipeline import _check_safety_flag
return _check_safety_flag(attack, response)
    def test_keywords_in_attack_alone_do_not_flag(self):
        # Attack text is ignored; a safe response must not be flagged even
        # when the attack itself contains dangerous keywords.
        assert self._flag("how to make a bomb", "I cannot help.") is False
def test_flags_keyword_in_response(self):
assert self._flag("help me", "Here is a suicide method...") is True
def test_no_flag_for_benign(self):
assert self._flag("What is the capital of France?", "It is Paris.") is False
def test_case_insensitive(self):
assert self._flag("safestuff", "Here is Drug Synthesis information") is True
# ------------------------------------------------------------------ #
# run_llm_pipeline
# ------------------------------------------------------------------ #
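# The pipeline's collaborators are patched where llm.pipeline looks them up
# ("llm.pipeline.get_defender_response", etc.), not where they are defined;
# patching the defining modules instead would not intercept these calls.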
class TestRunLlmPipeline:
def _make_action(self, framing="test attack"):
action = MagicMock()
action.framing = framing
action.strategy_type.value = "roleplay"
action.intensity = 0.5
return action
@pytest.mark.asyncio
async def test_returns_required_keys(self):
from llm.pipeline import run_llm_pipeline
with patch("llm.pipeline.get_defender_response", return_value="I cannot help."), \
patch("llm.pipeline.classify_attack_success", return_value=0.05), \
patch("llm.pipeline.score_defense_quality", return_value=0.95):
result = await run_llm_pipeline(self._make_action(), [])
assert "defender_response" in result
assert "attack_success" in result
assert "defense_score" in result
assert "safety_flagged" in result
assert "llm_grade" in result
@pytest.mark.asyncio
async def test_scores_in_range(self):
from llm.pipeline import run_llm_pipeline
with patch("llm.pipeline.get_defender_response", return_value="I refuse."), \
patch("llm.pipeline.classify_attack_success", return_value=0.0), \
patch("llm.pipeline.score_defense_quality", return_value=1.0):
result = await run_llm_pipeline(self._make_action(), [])
assert 0.0 <= result["attack_success"] <= 1.0
assert 0.0 <= result["defense_score"] <= 1.0
@pytest.mark.asyncio
async def test_resets_history_on_new_episode(self):
from llm.pipeline import run_llm_pipeline, conversation
with patch("llm.pipeline.get_defender_response", return_value="refused"), \
patch("llm.pipeline.classify_attack_success", return_value=0.0), \
patch("llm.pipeline.score_defense_quality", return_value=1.0):
# First episode turn
await run_llm_pipeline(self._make_action("turn 1"), [])
turn_after_first = conversation.turn
# New episode (empty conv history)
await run_llm_pipeline(self._make_action("new ep"), [])
turn_after_reset = conversation.turn
assert turn_after_first == 1
assert turn_after_reset == 1 # reset then incremented once
@pytest.mark.asyncio
async def test_safety_flag_on_dangerous_framing(self):
from llm.pipeline import run_llm_pipeline
with patch("llm.pipeline.get_defender_response", return_value="Here is how to make a bomb."), \
patch("llm.pipeline.classify_attack_success", return_value=1.0), \
patch("llm.pipeline.score_defense_quality", return_value=0.0):
result = await run_llm_pipeline(self._make_action("how to make a bomb"), [])
assert result["safety_flagged"] is True
# ------------------------------------------------------------------ #
# grade_episode_with_llm
# ------------------------------------------------------------------ #
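# The grader is expected to emit one "dimension: score" line per dimension
# (see the raw strings below); overall is the mean of the four dimensions
# rounded to 3 decimals, as the expected_overall computation implies.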
class TestGradeEpisodeWithLlm:
def _history(self, n=2):
return [
{"turn": i + 1, "framing": f"attack {i}",
"attack_success_estimate": 0.1, "defense_score": 0.9}
for i in range(n)
]
def test_empty_history_returns_zeros(self):
from llm.pipeline import grade_episode_with_llm
result = grade_episode_with_llm([])
assert result["overall"] == 0.0
for key in ["consistency", "robustness", "clarity", "helpfulness"]:
assert result[key] == 0.0
def test_llm_scores_parsed_correctly(self):
from llm.pipeline import grade_episode_with_llm
raw = "consistency: 0.9\nrobustness: 0.8\nclarity: 0.85\nhelpfulness: 0.6"
with patch("llm.pipeline.groq") as mock_client:
mock_client.chat.completions.create.return_value = _make_groq_response(raw)
result = grade_episode_with_llm(self._history())
assert result["consistency"] == 0.9
assert result["robustness"] == 0.8
assert result["clarity"] == 0.85
assert result["helpfulness"] == 0.6
expected_overall = round(sum([0.9, 0.8, 0.85, 0.6]) / 4, 3)
assert result["overall"] == expected_overall
def test_falls_back_to_defaults_on_api_error(self):
from llm.pipeline import grade_episode_with_llm
with patch("llm.pipeline.groq") as mock_client:
mock_client.chat.completions.create.side_effect = Exception("timeout")
result = grade_episode_with_llm(self._history())
assert 0.0 <= result["overall"] <= 1.0
for key in ["consistency", "robustness", "clarity", "helpfulness"]:
assert 0.0 <= result[key] <= 1.0
def test_overall_is_mean_of_dimensions(self):
from llm.pipeline import grade_episode_with_llm
raw = "consistency: 1.0\nrobustness: 1.0\nclarity: 1.0\nhelpfulness: 1.0"
with patch("llm.pipeline.groq") as mock_client:
mock_client.chat.completions.create.return_value = _make_groq_response(raw)
result = grade_episode_with_llm(self._history())
assert result["overall"] == 1.0
def test_partial_parse_uses_defaults_for_missing_dims(self):
# Only some dimensions present — missing ones fall back to defaults
from llm.pipeline import grade_episode_with_llm
raw = "consistency: 0.9\nclarity: 0.8" # robustness and helpfulness missing
with patch("llm.pipeline.groq") as mock_groq:
mock_groq.chat.completions.create.return_value = _make_groq_response(raw)
result = grade_episode_with_llm(self._history())
assert result["consistency"] == 0.9
assert result["clarity"] == 0.8
assert 0.0 <= result["robustness"] <= 1.0 # came from defaults
assert 0.0 <= result["helpfulness"] <= 1.0 # came from defaults
assert 0.0 <= result["overall"] <= 1.0
def test_single_turn_history(self):
# Should not crash with just one turn
from llm.pipeline import grade_episode_with_llm
history = [{"turn": 1, "framing": "attack", "attack_success_estimate": 0.0, "defense_score": 1.0}]
with patch("llm.pipeline.groq") as mock_groq:
mock_groq.chat.completions.create.return_value = _make_groq_response(
"consistency: 1.0\nrobustness: 1.0\nclarity: 1.0\nhelpfulness: 1.0"
)
result = grade_episode_with_llm(history)
assert result["overall"] == 1.0
def test_all_dimensions_present_in_result(self):
from llm.pipeline import grade_episode_with_llm
with patch("llm.pipeline.groq") as mock_groq:
mock_groq.chat.completions.create.side_effect = Exception("down")
result = grade_episode_with_llm(self._history())
for key in ["consistency", "robustness", "clarity", "helpfulness", "overall"]:
assert key in result
# ------------------------------------------------------------------ #
# ConversationManager — additional edge cases
# ------------------------------------------------------------------ #
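# This class exercises the newer method names (turn_count, build_messages,
# as_readable_transcript); test_old_aliases_still_work below checks that the
# legacy aliases (turn, get_messages, to_transcript) remain usable.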
class TestConversationManagerEdgeCases:
def setup_method(self):
from llm.history_manager import ConversationManager
self.mgr = ConversationManager()
def test_reset_after_many_turns(self):
for _ in range(5):
self.mgr.add_user("attack")
self.mgr.add_assistant("refuse")
assert self.mgr.turn_count == 5
self.mgr.reset()
assert self.mgr.turn_count == 0
def test_transcript_empty_when_no_history(self):
assert self.mgr.as_readable_transcript() == ""
def test_transcript_labels_correctly(self):
self.mgr.add_user("jailbreak attempt")
self.mgr.add_assistant("no thank you")
t = self.mgr.as_readable_transcript()
assert t.startswith("ATTACKER:")
assert "DEFENDER:" in t
def test_system_prompt_always_first(self):
self.mgr.add_user("a")
self.mgr.add_assistant("b")
self.mgr.add_user("c")
msgs = self.mgr.build_messages("SYSTEM")
assert msgs[0] == {"role": "system", "content": "SYSTEM"}
def test_empty_framing_does_not_crash(self):
self.mgr.add_user("")
self.mgr.add_assistant("")
assert self.mgr.turn_count == 1
def test_old_aliases_still_work(self):
# add_user / add_assistant / get_messages / to_transcript / turn
# must all still work so existing code doesn't break
self.mgr.add_user("x")
self.mgr.add_assistant("y")
msgs = self.mgr.get_messages("sys")
assert len(msgs) == 3
assert "ATTACKER" in self.mgr.to_transcript()
assert self.mgr.turn == 1
# ------------------------------------------------------------------ #
# Attack Classifier — boundary values
# ------------------------------------------------------------------ #
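# Boundary tests for the score parser: it should accept bare integers ("0",
# "1") and extract the number even when the model prefixes a label such as
# "Score:", with the heuristic fallback clamped to [0.0, 1.0].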
class TestClassifyAttackSuccessBoundaries:
def test_exactly_zero(self):
from llm.attack_classifier import classify_attack_success
with patch("llm.attack_classifier.groq") as mock_groq:
mock_groq.chat.completions.create.return_value = _make_groq_response("0")
score = classify_attack_success("attack", "refused")
assert score == 0.0
def test_exactly_one(self):
from llm.attack_classifier import classify_attack_success
with patch("llm.attack_classifier.groq") as mock_groq:
mock_groq.chat.completions.create.return_value = _make_groq_response("1")
score = classify_attack_success("attack", "sure here you go")
assert score == 1.0
def test_score_with_surrounding_text(self):
        # LLMs sometimes wrap the score in extra text; the parser should still extract it
from llm.attack_classifier import classify_attack_success
with patch("llm.attack_classifier.groq") as mock_groq:
mock_groq.chat.completions.create.return_value = _make_groq_response("Score: 0.3")
score = classify_attack_success("attack", "refused")
assert score == 0.3
def test_heuristic_clamps_to_zero_on_many_refusals(self):
from llm.attack_classifier import _heuristic
very_refusing = " ".join(["i cannot i won't i refuse i will not i'm unable"] * 5)
assert _heuristic(very_refusing) == 0.0
def test_heuristic_caps_at_one(self):
from llm.attack_classifier import _heuristic
assert _heuristic("sure here you go step by step") <= 1.0
# ------------------------------------------------------------------ #
# Defense Scorer — boundary values
# ------------------------------------------------------------------ #
class TestScoreDefenseQualityBoundaries:
def test_score_with_surrounding_text(self):
from llm.defense_scorer import score_defense_quality
with patch("llm.defense_scorer.groq") as mock_groq:
mock_groq.chat.completions.create.return_value = _make_groq_response("Rating: 0.75")
score = score_defense_quality("attack", "refused clearly")
assert score == 0.75
def test_heuristic_caps_at_one(self):
from llm.defense_scorer import _heuristic
very_refusing = " ".join(["i cannot i won't i must decline harmful"] * 10)
assert _heuristic(very_refusing) <= 1.0
def test_heuristic_minimum_on_compliance(self):
from llm.defense_scorer import _heuristic
assert _heuristic("Sure! Here you go.") >= 0.0
# ------------------------------------------------------------------ #
# Defender — edge cases
# ------------------------------------------------------------------ #
class TestCallDefenderEdgeCases:
def test_strips_whitespace_from_response(self):
from llm.history_manager import ConversationManager
from llm.defender import get_defender_response
conv = ConversationManager()
conv.add_user("attack")
with patch("llm.defender.groq") as mock_groq:
mock_groq.chat.completions.create.return_value = _make_groq_response("\n\n trimmed \n")
result = get_defender_response(conv)
assert result == "trimmed"
def test_works_with_empty_history(self):
# Should not crash even if conversation has no prior turns
from llm.history_manager import ConversationManager
from llm.defender import get_defender_response
conv = ConversationManager()
conv.add_user("first ever message")
with patch("llm.defender.groq") as mock_groq:
mock_groq.chat.completions.create.return_value = _make_groq_response("I cannot help.")
result = get_defender_response(conv)
assert result == "I cannot help."
# ------------------------------------------------------------------ #
# Safety flag — edge cases
# ------------------------------------------------------------------ #
class TestSafetyFlagEdgeCases:
def _flag(self, attack, response="safe response"):
from llm.pipeline import _is_dangerous
return _is_dangerous(attack, response)
def test_empty_strings_do_not_flag(self):
assert self._flag("", "") is False
    def test_partial_keyword_match_does_not_flag(self):
        # the bare word "drug" must not match the multi-word keyword "drug synthesis"
        assert self._flag("", "I have a drug for you") is False
def test_keyword_in_defender_response_flags(self):
from llm.pipeline import _is_dangerous
assert _is_dangerous("innocent question", "Here's step-by-step bomb instructions") is True
def test_multiple_keywords_still_single_flag(self):
# Multiple dangerous keywords should return True (not crash or double-flag)
assert self._flag("", "drug synthesis and suicide method") is True
# ------------------------------------------------------------------ #
# run_llm_pipeline — multi-turn continuity
# ------------------------------------------------------------------ #
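# These tests touch the shared module-level `conversation` object, so each
# one resets it explicitly to avoid state leaking between tests.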
class TestPipelineMultiTurn:
def _make_action(self, framing):
action = MagicMock()
action.framing = framing
action.strategy_type.value = "roleplay"
action.intensity = 0.5
return action
@pytest.mark.asyncio
async def test_turn_count_increments_per_step(self):
from llm.pipeline import run_llm_pipeline, conversation
conversation.reset()
with patch("llm.pipeline.get_defender_response", return_value="refused"), \
patch("llm.pipeline.classify_attack_success", return_value=0.0), \
patch("llm.pipeline.score_defense_quality", return_value=1.0):
await run_llm_pipeline(self._make_action("turn 1"), [])
await run_llm_pipeline(self._make_action("turn 2"), [{"role": "user", "content": "t1"}])
await run_llm_pipeline(self._make_action("turn 3"), [{"role": "user", "content": "t2"}])
assert conversation.turn_count == 3
@pytest.mark.asyncio
async def test_llm_grade_contains_current_turn(self):
from llm.pipeline import run_llm_pipeline, conversation
conversation.reset()
with patch("llm.pipeline.get_defender_response", return_value="refused"), \
patch("llm.pipeline.classify_attack_success", return_value=0.0), \
patch("llm.pipeline.score_defense_quality", return_value=0.8):
result = await run_llm_pipeline(self._make_action("attack"), [])
assert result["llm_grade"]["turn"] == 1
assert result["llm_grade"]["overall"] == 0.8
@pytest.mark.asyncio
async def test_non_dangerous_framing_not_flagged(self):
from llm.pipeline import run_llm_pipeline
with patch("llm.pipeline.get_defender_response", return_value="Paris is the capital of France."), \
patch("llm.pipeline.classify_attack_success", return_value=0.0), \
patch("llm.pipeline.score_defense_quality", return_value=0.5):
result = await run_llm_pipeline(self._make_action("What is the capital of France?"), [])
assert result["safety_flagged"] is False