menimeni123 committed on
Commit
b2eddc3
1 Parent(s): 60fbaa9
Files changed (1) hide show
  1. handler.py +17 -7
handler.py CHANGED
@@ -36,14 +36,24 @@ class EndpointHandler:
36
 
37
  # Additional analysis
38
  entropy = -np.sum(probabilities * np.log(probabilities + 1e-9))
39
- max_prob_ratio = np.max(probabilities) / np.sort(probabilities)[-2]
40
-
41
- # Adjust confidence based on entropy and probability ratio
42
- adjusted_confidence = confidence * (1 - entropy/np.log(len(probabilities))) * max_prob_ratio
43
 
44
- # Lower the confidence for very short inputs
45
- if len(text.split()) < 4:
46
- adjusted_confidence *= 0.5
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  return {
49
  "label": predicted_label,
 
36
 
37
  # Additional analysis
38
  entropy = -np.sum(probabilities * np.log(probabilities + 1e-9))
39
+
40
+ # Adjust confidence based on entropy
41
+ adjusted_confidence = confidence * (1 - entropy/np.log(len(probabilities)))
 
42
 
43
+ # Post-processing to better distinguish between INJECTION and JAILBREAK
44
+ injection_keywords = ['ignore', 'previous', 'instructions', 'don\'t', 'matter']
45
+ jailbreak_keywords = ['bypass', 'restrictions', 'override', 'security']
46
+
47
+ injection_score = sum(keyword in text.lower() for keyword in injection_keywords) / len(injection_keywords)
48
+ jailbreak_score = sum(keyword in text.lower() for keyword in jailbreak_keywords) / len(jailbreak_keywords)
49
+
50
+ if predicted_label in ['INJECTION', 'JAILBREAK']:
51
+ if injection_score > jailbreak_score:
52
+ predicted_label = 'INJECTION'
53
+ elif jailbreak_score > injection_score:
54
+ predicted_label = 'JAILBREAK'
55
+
56
+ adjusted_confidence = max(adjusted_confidence, injection_score, jailbreak_score)
57
 
58
  return {
59
  "label": predicted_label,