geekyrakshit commited on
Commit
fdb575d
1 Parent(s): 2633ee9

add: PromptInjectionLlamaGuardrail

Browse files
guardrails_genie/guardrails/__init__.py CHANGED
@@ -6,6 +6,7 @@ from guardrails_genie.guardrails.entity_recognition import (
6
  )
7
  from guardrails_genie.guardrails.injection import (
8
  PromptInjectionClassifierGuardrail,
 
9
  PromptInjectionSurveyGuardrail,
10
  )
11
  from guardrails_genie.guardrails.secrets_detection import SecretsDetectionGuardrail
@@ -13,6 +14,7 @@ from guardrails_genie.guardrails.secrets_detection import SecretsDetectionGuardr
13
  from .manager import GuardrailManager
14
 
15
  __all__ = [
 
16
  "PromptInjectionSurveyGuardrail",
17
  "PromptInjectionClassifierGuardrail",
18
  "PresidioEntityRecognitionGuardrail",
 
6
  )
7
  from guardrails_genie.guardrails.injection import (
8
  PromptInjectionClassifierGuardrail,
9
+ PromptInjectionLlamaGuardrail,
10
  PromptInjectionSurveyGuardrail,
11
  )
12
  from guardrails_genie.guardrails.secrets_detection import SecretsDetectionGuardrail
 
14
  from .manager import GuardrailManager
15
 
16
  __all__ = [
17
+ "PromptInjectionLlamaGuardrail",
18
  "PromptInjectionSurveyGuardrail",
19
  "PromptInjectionClassifierGuardrail",
20
  "PresidioEntityRecognitionGuardrail",
guardrails_genie/guardrails/injection/__init__.py CHANGED
@@ -1,4 +1,9 @@
1
  from .classifier_guardrail import PromptInjectionClassifierGuardrail
 
2
  from .survey_guardrail import PromptInjectionSurveyGuardrail
3
 
4
- __all__ = ["PromptInjectionSurveyGuardrail", "PromptInjectionClassifierGuardrail"]
 
 
 
 
 
1
  from .classifier_guardrail import PromptInjectionClassifierGuardrail
2
+ from .llama_prompt_guardrail import PromptInjectionLlamaGuardrail
3
  from .survey_guardrail import PromptInjectionSurveyGuardrail
4
 
5
+ __all__ = [
6
+ "PromptInjectionLlamaGuardrail",
7
+ "PromptInjectionSurveyGuardrail",
8
+ "PromptInjectionClassifierGuardrail",
9
+ ]
guardrails_genie/guardrails/injection/llama_prompt_guardrail.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import weave
6
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
7
+
8
+ from ..base import Guardrail
9
+
10
+
11
+ class PromptInjectionLlamaGuardrail(Guardrail):
12
+ model_name: str = "meta-llama/Prompt-Guard-86M"
13
+ max_sequence_length: int = 512
14
+ temperature: float = 1.0
15
+ jailbreak_score_threshold: float = 0.5
16
+ indirect_injection_score_threshold: float = 0.5
17
+ device: str = "cuda" if torch.cuda.is_available() else "cpu"
18
+ _tokenizer: Optional[AutoTokenizer] = None
19
+ _model: Optional[AutoModelForSequenceClassification] = None
20
+
21
+ def model_post_init(self, __context):
22
+ self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
23
+ self._model = AutoModelForSequenceClassification.from_pretrained(
24
+ self.model_name
25
+ )
26
+
27
+ def get_class_probabilities(self, prompt):
28
+ inputs = self._tokenizer(
29
+ prompt,
30
+ return_tensors="pt",
31
+ padding=True,
32
+ truncation=True,
33
+ max_length=self.max_sequence_length,
34
+ )
35
+ inputs = inputs.to(self.device)
36
+ with torch.no_grad():
37
+ logits = self._model(**inputs).logits
38
+ scaled_logits = logits / self.temperature
39
+ probabilities = F.softmax(scaled_logits, dim=-1)
40
+ return probabilities
41
+
42
+ @weave.op()
43
+ def get_score(self, prompt: str):
44
+ probabilities = self.get_class_probabilities(prompt)
45
+ return {
46
+ "jailbreak_score": probabilities[0, 2].item(),
47
+ "indirect_injection_score": (
48
+ probabilities[0, 1] + probabilities[0, 2]
49
+ ).item(),
50
+ }
51
+
52
+ @weave.op()
53
+ def guard(self, prompt: str):
54
+ score = self.get_score(prompt)
55
+ summary = ""
56
+ if score["jailbreak_score"] > self.jailbreak_score_threshold:
57
+ confidence = round(score["jailbreak_score"] * 100, 2)
58
+ summary += f"Prompt is deemed to be a jailbreak attempt with {confidence}% confidence."
59
+ if score["indirect_injection_score"] > self.indirect_injection_score_threshold:
60
+ confidence = round(score["indirect_injection_score"] * 100, 2)
61
+ summary += f"Prompt is deemed to be an indirect injection attempt with {confidence}% confidence."
62
+ return {
63
+ "safe": score["jailbreak_score"] < self.jailbreak_score_threshold
64
+ and score["indirect_injection_score"]
65
+ < self.indirect_injection_score_threshold,
66
+ "summary": summary,
67
+ }
68
+
69
+ @weave.op()
70
+ def predict(self, prompt: str):
71
+ return self.guard(prompt)