menimeni123
committed on
Commit
•
b2eddc3
1
Parent(s):
60fbaa9
latest
Browse files - handler.py +17 -7
handler.py
CHANGED
@@ -36,14 +36,24 @@ class EndpointHandler:
|
|
36 |
|
37 |
# Additional analysis
|
38 |
entropy = -np.sum(probabilities * np.log(probabilities + 1e-9))
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
adjusted_confidence = confidence * (1 - entropy/np.log(len(probabilities))) * max_prob_ratio
|
43 |
|
44 |
-
#
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
return {
|
49 |
"label": predicted_label,
|
|
|
36 |
|
37 |
# Additional analysis
|
38 |
entropy = -np.sum(probabilities * np.log(probabilities + 1e-9))
|
39 |
+
|
40 |
+
# Adjust confidence based on entropy
|
41 |
+
adjusted_confidence = confidence * (1 - entropy/np.log(len(probabilities)))
|
|
|
42 |
|
43 |
+
# Post-processing to better distinguish between INJECTION and JAILBREAK
|
44 |
+
injection_keywords = ['ignore', 'previous', 'instructions', 'don\'t', 'matter']
|
45 |
+
jailbreak_keywords = ['bypass', 'restrictions', 'override', 'security']
|
46 |
+
|
47 |
+
injection_score = sum(keyword in text.lower() for keyword in injection_keywords) / len(injection_keywords)
|
48 |
+
jailbreak_score = sum(keyword in text.lower() for keyword in jailbreak_keywords) / len(jailbreak_keywords)
|
49 |
+
|
50 |
+
if predicted_label in ['INJECTION', 'JAILBREAK']:
|
51 |
+
if injection_score > jailbreak_score:
|
52 |
+
predicted_label = 'INJECTION'
|
53 |
+
elif jailbreak_score > injection_score:
|
54 |
+
predicted_label = 'JAILBREAK'
|
55 |
+
|
56 |
+
adjusted_confidence = max(adjusted_confidence, injection_score, jailbreak_score)
|
57 |
|
58 |
return {
|
59 |
"label": predicted_label,
|