Spaces:
Running
Running
johnsonhung906
committed on
Commit
•
88e1260
1
Parent(s):
6792367
change header styles
Browse files- .DS_Store +0 -0
- assets/css/custom_style.css +0 -83
- assets/css/style.css +4 -4
- demo_results/normal_1.png +0 -0
- demo_results/normal_3.png +0 -0
- demo_results/normal_5.png +0 -0
- index.html +105 -128
- style.css +81 -55
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
|
|
assets/css/custom_style.css
DELETED
@@ -1,83 +0,0 @@
|
|
1 |
-
@media screen and (min-width: 70em) { .main-content { max-width: 70rem; padding: 2rem 6rem; margin: 0 auto; font-size: 1.1rem; } }
|
2 |
-
@font-face { font-family: 'flexslider-icon'; src: url("../fonts/flexslider-icon.eot"); src: url("../fonts/flexslider-icon.eot?#iefix") format("embedded-opentype"), url("../fonts/flexslider-icon.woff") format("woff"), url("../fonts/flexslider-icon.ttf") format("truetype"), url("../fonts/flexslider-icon.svg#flexslider-icon") format("svg"); font-weight: normal; font-style: normal; }
|
3 |
-
header h1, header h2 { font-weight: normal; line-height: normal; }
|
4 |
-
|
5 |
-
header h2 { margin-top: .83em; }
|
6 |
-
|
7 |
-
.main-content p { text-align: justify; }
|
8 |
-
|
9 |
-
.jailbreak-intro-sec { width: 80%; margin: 1em auto; }
|
10 |
-
|
11 |
-
#refusal-loss-formula .formula { text-align: center; }
|
12 |
-
|
13 |
-
#refusal-loss-formula .formula-list { width: fit-content; margin: 0 auto; }
|
14 |
-
|
15 |
-
#refusal-loss-formula .formula-list a { display: inline-block; width: 250px; margin: 0 20px; padding: 8px 10px; text-align: center; background: #DDD; cursor: pointer; text-decoration: none; color: #333; border-radius: 10px; user-select: none; transition-duration: 0.3s; }
|
16 |
-
|
17 |
-
#jailbreak-demo .radio-group { margin-right: 5px; }
|
18 |
-
|
19 |
-
input[type='radio'] { visibility: hidden; display: none; }
|
20 |
-
|
21 |
-
#jailbreak-demo .radio-group .option-label { font-size: 1em; cursor: pointer; position: relative; padding: 0.1em 0.6em; border: 1px solid #999; background: #FFF; border-radius: 0.2em; transition: 0.2s; }
|
22 |
-
|
23 |
-
#jailbreak-demo .radio-group .options:checked ~ .option-label { color: #FFF; background: #777; }
|
24 |
-
|
25 |
-
#refusal-loss-formula .formula-list a:hover, #jailbreak-demo #defense-methods .defense:hover { background: #555; color: #FFF; }
|
26 |
-
|
27 |
-
#jailbreak-demo #defense-methods .options:checked ~ .defense { color: #FFF; background: #555; }
|
28 |
-
|
29 |
-
#jailbreak-demo #defense-methods .defense { display: inline-block; width: 60%; margin: 2% auto 8%; padding: 8px 10px; text-align: center; background: #DDD; cursor: pointer; text-decoration: none; color: #333; border-radius: 10px; user-select: none; transition-duration: 0.3s; }
|
30 |
-
|
31 |
-
#jailbreak-demo .legend { text-align: center; width: 70%; margin: 0 auto; }
|
32 |
-
|
33 |
-
#jailbreak-demo .figure-option { text-align: center; width: 70%; margin: 4% auto 0; /* Customize the label (the container) */ /* Hide the browser's default checkbox */ /* Create a custom checkbox */ /* On mouse-over, add a grey background color */ /* When the checkbox is checked, add a blue background */ /* Create the checkmark/indicator (hidden when not checked) */ /* Show the checkmark when checked */ /* Style the checkmark/indicator */ }
|
34 |
-
#jailbreak-demo .figure-option .container { display: block; position: relative; padding-left: 35px; margin-bottom: 12px; cursor: pointer; font-size: 22px; -webkit-user-select: none; -moz-user-select: none; -ms-user-select: none; user-select: none; }
|
35 |
-
#jailbreak-demo .figure-option .container input { position: absolute; opacity: 0; cursor: pointer; height: 0; width: 0; }
|
36 |
-
#jailbreak-demo .figure-option .checkmark { position: absolute; top: 4px; left: 8px; height: 25px; width: 25px; background-color: #eee; }
|
37 |
-
#jailbreak-demo .figure-option .container:hover input ~ .checkmark { background-color: #ccc; }
|
38 |
-
#jailbreak-demo .figure-option .container input:checked ~ .checkmark { background-color: #9b9bff; }
|
39 |
-
#jailbreak-demo .figure-option .checkmark:after { content: ""; position: absolute; display: none; }
|
40 |
-
#jailbreak-demo .figure-option .container input:checked ~ .checkmark:after { display: block; }
|
41 |
-
#jailbreak-demo .figure-option .container .checkmark:after { left: 9px; top: 5px; width: 5px; height: 10px; border: solid white; border-width: 0 3px 3px 0; -webkit-transform: rotate(45deg); -ms-transform: rotate(45deg); transform: rotate(45deg); }
|
42 |
-
|
43 |
-
#jailbreak-demo .figure { margin: 0 auto; display: block; }
|
44 |
-
|
45 |
-
#jailbreak-demo .figure #original { display: none; }
|
46 |
-
|
47 |
-
#jailbreak-demo .figure img { user-drag: none; -webkit-user-drag: none; user-select: none; -khtml-user-drag: none; -moz-user-drag: none; -o-user-drag: none; pointer-events: none; position: relative; left: 35px; }
|
48 |
-
|
49 |
-
#jailbreak-demo .figure-caption { width: 240px; text-align: center; display: block; margin: 0 auto; padding: 10px 0 0; font-size: .8em; }
|
50 |
-
|
51 |
-
#jailbreak-demo .figure-caption ul { padding-left: 0; }
|
52 |
-
|
53 |
-
#jailbreak-demo .figure-caption ul li { list-style: none; }
|
54 |
-
|
55 |
-
#jailbreak-demo .figure-caption .model-prediction { font-weight: bold; }
|
56 |
-
|
57 |
-
#jailbreak-demo .figure-caption .correct { color: #009926; }
|
58 |
-
|
59 |
-
#jailbreak-demo .figure-caption .wrong { color: #e31327; }
|
60 |
-
|
61 |
-
#jailbreak-demo .attack-success-rate { display: inline-block; width: 60%; margin: 2% auto 8%; padding: 8px 10px; text-align: center; text-decoration: none; background: #DDD; color: #333; border-radius: 10px; user-select: none; }
|
62 |
-
#jailbreak-demo .attack-success-rate .jailbreak-metric { font-size: 0.75em; display: block; }
|
63 |
-
#jailbreak-demo .attack-success-rate .attack-success-rate-value { font-size: 1.5em; font-family: "sans-serif"; color: #820000; }
|
64 |
-
|
65 |
-
#jailbreak-demo .benign-refusal-rate { display: inline-block; width: 60%; margin: 2% auto 8%; padding: 8px 10px; text-align: center; text-decoration: none; background: #DDD; color: #333; border-radius: 10px; user-select: none; }
|
66 |
-
#jailbreak-demo .benign-refusal-rate .jailbreak-metric { font-size: 0.75em; display: block; }
|
67 |
-
#jailbreak-demo .benign-refusal-rate .benign-refusal-rate-value { font-size: 1.5em; font-family: "sans-serif"; color: #820000; }
|
68 |
-
|
69 |
-
.warning-quote { padding: 15px; font-size: 0.8em; background-color: #f43636ba; color: white; margin-bottom: 15px; border-left: 5px solid #ff3030; transition-duration: 0.3s; }
|
70 |
-
|
71 |
-
.closebtn { margin-left: 15px; color: white; font-weight: bold; float: right; font-size: 6px; line-height: 20px; cursor: pointer; transition: 0.3s; }
|
72 |
-
|
73 |
-
/* When moving the mouse over the close button */
|
74 |
-
.closebtn:hover { color: black; }
|
75 |
-
|
76 |
-
.slider-container { display: block; margin-top: 1em; margin-bottom: 0.5em; float: left; }
|
77 |
-
|
78 |
-
.slider-label { width: 140px; float: left; line-height: 1; }
|
79 |
-
|
80 |
-
.slider-content { width: 450px; position: relative; float: right; }
|
81 |
-
|
82 |
-
#ppl-threshold, #gradient-norm-threshold { width: 3em; height: 1.6em; top: 50%; margin-top: -.8em; text-align: center; line-height: 1.6em; }
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
assets/css/style.css
CHANGED
@@ -4,7 +4,7 @@
|
|
4 |
html { font-family: sans-serif; /* 1 */ -ms-text-size-adjust: 100%; /* 2 */ -webkit-text-size-adjust: 100%; /* 2 */ }
|
5 |
|
6 |
/** Remove default margin. */
|
7 |
-
body { margin: 0; }
|
8 |
|
9 |
/* HTML5 display definitions ========================================================================== */
|
10 |
/** Correct `block` display not defined for any HTML5 element in IE 8/9. Correct `block` display not defined for `details` or `summary` in IE 10/11 and Firefox. Correct `block` display not defined for `main` in IE 11. */
|
@@ -275,9 +275,9 @@ a:hover { text-decoration: underline; }
|
|
275 |
@media screen and (max-width: 42em) { .page-header { padding: 2rem 1rem; } }
|
276 |
|
277 |
.project-name { margin-top: 0; margin-bottom: 0.1rem; }
|
278 |
-
@media screen and (min-width: 64em) { .project-name { font-size:
|
279 |
-
@media screen and (min-width: 42em) and (max-width: 64em) { .project-name { font-size:
|
280 |
-
@media screen and (max-width: 42em) { .project-name { font-size:
|
281 |
|
282 |
.project-tagline { margin-bottom: 2rem; font-weight: normal; opacity: 0.7; }
|
283 |
@media screen and (min-width: 64em) { .project-tagline { font-size: 1.25rem; } }
|
|
|
4 |
html { font-family: sans-serif; /* 1 */ -ms-text-size-adjust: 100%; /* 2 */ -webkit-text-size-adjust: 100%; /* 2 */ }
|
5 |
|
6 |
/** Remove default margin. */
|
7 |
+
body { margin: 0; font-family: sans-serif;}
|
8 |
|
9 |
/* HTML5 display definitions ========================================================================== */
|
10 |
/** Correct `block` display not defined for any HTML5 element in IE 8/9. Correct `block` display not defined for `details` or `summary` in IE 10/11 and Firefox. Correct `block` display not defined for `main` in IE 11. */
|
|
|
275 |
@media screen and (max-width: 42em) { .page-header { padding: 2rem 1rem; } }
|
276 |
|
277 |
.project-name { margin-top: 0; margin-bottom: 0.1rem; }
|
278 |
+
@media screen and (min-width: 64em) { .project-name { font-size: 4.25rem; } }
|
279 |
+
@media screen and (min-width: 42em) and (max-width: 64em) { .project-name { font-size: 3.25rem; } }
|
280 |
+
@media screen and (max-width: 42em) { .project-name { font-size: 2.75rem; } }
|
281 |
|
282 |
.project-tagline { margin-bottom: 2rem; font-weight: normal; opacity: 0.7; }
|
283 |
@media screen and (min-width: 64em) { .project-tagline { font-size: 1.25rem; } }
|
demo_results/normal_1.png
CHANGED
demo_results/normal_3.png
CHANGED
demo_results/normal_5.png
DELETED
Binary file (209 kB)
|
|
index.html
CHANGED
@@ -23,8 +23,8 @@
|
|
23 |
<link rel="stylesheet" href="assets/css/style.css?v=90447f115a006bc45b738d9592069468b20e2551">
|
24 |
<!-- start custom head snippets, customize with your own _includes/head-custom.html file -->
|
25 |
<link rel="stylesheet" href="assets/css/custom_style.css?v=90447f115a006bc45b738d9592069468b20e2551">
|
26 |
-
|
27 |
-
|
28 |
<link rel="stylesheet" href="https://ajax.googleapis.com/ajax/libs/jqueryui/1.12.1/themes/smoothness/jquery-ui.css">
|
29 |
<script src="https://ajax.googleapis.com/ajax/libs/jqueryui/1.12.1/jquery-ui.min.js"></script>
|
30 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/2.9.4/Chart.js"></script>
|
@@ -32,133 +32,66 @@
|
|
32 |
<link rel="stylesheet" href="//code.jquery.com/ui/1.13.2/themes/base/jquery-ui.css">
|
33 |
<link rel="stylesheet" href="/resources/demos/style.css">
|
34 |
<script src="https://code.jquery.com/jquery-3.6.0.js"></script>
|
35 |
-
<script src="https://code.jquery.com/ui/1.13.2/jquery-ui.js"></script>
|
36 |
|
37 |
<!-- for mathjax support -->
|
38 |
<script src="https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6"></script>
|
39 |
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
|
40 |
<!-- end custom head snippets -->
|
41 |
|
42 |
-
|
43 |
-
|
44 |
-
let attackIndex = 0;
|
45 |
-
|
46 |
-
function navigateImages(type, direction) {
|
47 |
-
let images;
|
48 |
-
let currentIndex;
|
49 |
-
|
50 |
-
// Determine which set of images to target and the current index
|
51 |
-
if (type === 'normal') {
|
52 |
-
images = document.querySelectorAll('.normal-gallery .image-gallery img');
|
53 |
-
currentIndex = normalIndex;
|
54 |
-
} else if (type === 'attack') {
|
55 |
-
images = document.querySelectorAll('.attack-gallery .image-gallery img');
|
56 |
-
currentIndex = attackIndex;
|
57 |
-
}
|
58 |
-
|
59 |
-
// Check if images were found to avoid accessing undefined elements
|
60 |
-
if (images && images.length > 0) {
|
61 |
-
// Hide current image
|
62 |
-
images[currentIndex].classList.remove('active');
|
63 |
-
|
64 |
-
// Calculate new index (modulo ensures cycling)
|
65 |
-
currentIndex = (currentIndex + direction + images.length) % images.length;
|
66 |
-
|
67 |
-
// Show new image
|
68 |
-
images[currentIndex].classList.add('active');
|
69 |
-
|
70 |
-
// Update index tracker
|
71 |
-
if (type === 'normal') {
|
72 |
-
normalIndex = currentIndex;
|
73 |
-
} else if (type === 'attack') {
|
74 |
-
attackIndex = currentIndex;
|
75 |
-
}
|
76 |
-
} else {
|
77 |
-
console.error("No images found for type:", type);
|
78 |
-
}
|
79 |
-
}
|
80 |
-
</script>
|
81 |
-
|
82 |
-
<style>
|
83 |
-
.image-gallery-container {
|
84 |
-
position: relative;
|
85 |
-
display: flex;
|
86 |
-
justify-content: center;
|
87 |
-
align-items: center;
|
88 |
-
margin: 20px auto;
|
89 |
-
}
|
90 |
|
91 |
-
|
92 |
-
|
93 |
-
max-height: 600px; /* Increased height */
|
94 |
-
overflow: hidden;
|
95 |
-
position: relative;
|
96 |
-
margin-left: 50px;
|
97 |
-
margin-right: 50px;
|
98 |
-
}
|
99 |
-
|
100 |
-
.image-gallery img {
|
101 |
-
width: 100%;
|
102 |
-
height: auto;
|
103 |
-
border-radius: 10px;
|
104 |
-
display: none;
|
105 |
-
}
|
106 |
-
|
107 |
-
.image-gallery img.active {
|
108 |
-
display: block;
|
109 |
-
}
|
110 |
-
|
111 |
-
.arrow {
|
112 |
-
cursor: pointer;
|
113 |
-
position: absolute;
|
114 |
-
top: 50%;
|
115 |
-
transform: translateY(-50%);
|
116 |
-
font-size: 2.5em;
|
117 |
-
color: #444; /* Darker color */
|
118 |
-
padding: 5px 15px;
|
119 |
-
transition: color 0.3s;
|
120 |
-
user-select: none;
|
121 |
-
transform: scaleX(0.7);
|
122 |
-
}
|
123 |
-
|
124 |
-
.arrow:hover {
|
125 |
-
color: #007bff; /* Color on hover */
|
126 |
-
}
|
127 |
-
|
128 |
-
.left-arrow {
|
129 |
-
left: 20px; /* Adjusted position */
|
130 |
-
}
|
131 |
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
}
|
153 |
-
|
154 |
-
|
155 |
</head>
|
156 |
<body>
|
157 |
<header class="page-header" role="banner">
|
158 |
-
<h1 class="project-name">Attention Tracker</h1>
|
159 |
<h2 class="project-tagline">Attention Tracker: Detecting Prompt Injection Attacks in LLMs</h2>
|
160 |
-
<
|
161 |
-
<div style="text-align: center; font-size:
|
162 |
<div>
|
163 |
<a href="https://khhung906.github.io/" style="color: white;" target="_blank" rel="noopener noreferrer">
|
164 |
Kuo-Han Hung<sup>1</sup>,
|
@@ -182,12 +115,57 @@
|
|
182 |
</a>
|
183 |
</div>
|
184 |
|
185 |
-
<div>
|
186 |
<sup>1</sup>National Taiwan University <sup>2</sup>IBM Research
|
187 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
</header>
|
|
|
|
|
189 |
<main id="content" class="main-content" role="main">
|
190 |
-
<h2 id="abstract">Abstract</h2>
|
191 |
|
192 |
<p>Large Language Models (LLMs) have revolutionized various domains but remain vulnerable to prompt injection attacks, where malicious inputs manipulate the model into ignoring original instructions and executing designated action. In this paper, we investigate
|
193 |
the underlying mechanisms of these attacks by analyzing the attention patterns within LLMs.
|
@@ -199,11 +177,11 @@
|
|
199 |
demonstrate the robustness of our approach through extensive evaluations and provide insights into safeguarding LLM-integrated systems from prompt injection vulnerabilities.
|
200 |
</p>
|
201 |
|
202 |
-
<h2 id="what-is-jailbreak">What is Prompt Injection Attack?</h2>
|
203 |
<p>A Prompt Injection Attack is a technique used to manipulate language models (like GPT-3 or similar AI systems) by injecting malicious or deceptive prompts into the input data, causing the model to behave in unexpected or undesired ways. This attack exploits the way language models interpret and respond to instructions, tricking them into providing information or performing actions that were not originally intended.</p>
|
204 |
|
205 |
|
206 |
-
<h2 id="refusal-loss">Distraction Effect</h2>
|
207 |
|
208 |
<p>
|
209 |
In this section, we analyze the reasons behind the success of prompt injection attacks on LLMs. Specifically, we aim to understand
|
@@ -223,7 +201,7 @@
|
|
223 |
|
224 |
</div>
|
225 |
|
226 |
-
<h2 id="proposed-approach-attention-tracker">Proposed Approach: Attention Tracker</h2>
|
227 |
<p> With the discovery of the distraction effect, we propose <strong>Attention Tracker</strong>,
|
228 |
a prompt injection detection method based on tracking the attention pattern on instruction. Our detection procedure is shown below:
|
229 |
</p>
|
@@ -245,7 +223,7 @@
|
|
245 |
We provide more details about the running flow of Attention Tracker in the paper.
|
246 |
</p>
|
247 |
|
248 |
-
<h2 id="result-attention-tracker">Experiment Result</h2>
|
249 |
<p>
|
250 |
In this section, we evaluate Attention Tracker against various baselines with the AUROC score on two prompt injection detection benchmarks: Open-Prompt-Injection and deepset prompt injection dataset:
|
251 |
</p>
|
@@ -254,7 +232,7 @@
|
|
254 |
<p>
|
255 |
As shown in the table, Attention Tracker consistently outperforms existing baselines, with an AUROC improvement of up to 3.1% on the Open-Prompt-Injection benchmark and 10.0% on the deepset prompt injection dataset. Among training-free methods, it achieves even greater gains, with an average AUROC improvement of 31.3% and 20.9% across the two datasets, respectively. Unlike LLM-based methods that rely on larger models for stability, Attention Tracker delivers robust and effective performance even with smaller LLMs, underscoring its suitability for real-world applications.
|
256 |
</p>
|
257 |
-
<h2 id="demonstration">Demo</h2>
|
258 |
|
259 |
<p>
|
260 |
We evaluated the effectiveness of the Attention Tracker by visualizing the distribution of attention aggregation for key heads across different data types (normal data vs. attack data) in the Open-Prompt-Injection dataset. Additionally, we calculated the focus score for these data samples. A higher focus score indicates a lower likelihood of prompt injection attacks. The tested model is Qwen-2 1.8b.
|
@@ -268,7 +246,6 @@
|
|
268 |
<img id="normalImage2" src="./demo_results/normal_2.png" alt="Normal Image 2">
|
269 |
<img id="normalImage3" src="./demo_results/normal_3.png" alt="Normal Image 3">
|
270 |
<img id="normalImage4" src="./demo_results/normal_4.png" alt="Normal Image 4">
|
271 |
-
<img id="normalImage4" src="./demo_results/normal_5.png" alt="Normal Image 5">
|
272 |
</div>
|
273 |
<span class="arrow right-arrow" onclick="navigateImages('normal', 1)">></span>
|
274 |
</div>
|
@@ -285,12 +262,12 @@
|
|
285 |
<span class="arrow right-arrow" onclick="navigateImages('attack', 1)">></span>
|
286 |
</div>
|
287 |
|
288 |
-
<h2 id="inquiries"> Inquiries on Attention Tracker</h2>
|
289 |
-
<p> Please contact <a href="Mailto:khhung906@gmail.com">Kuo-Han Hung</a>
|
290 |
and <a href="Mailto:pin-yu.chen@ibm.com">Pin-Yu Chen</a>
|
291 |
-
</p>
|
292 |
|
293 |
-
<h2 id="citations">Citations</h2>
|
294 |
<p>If you find Attention Tracker helpful and useful for your research, please cite our main paper as follows:</p>
|
295 |
|
296 |
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>
|
|
|
23 |
<link rel="stylesheet" href="assets/css/style.css?v=90447f115a006bc45b738d9592069468b20e2551">
|
24 |
<!-- start custom head snippets, customize with your own _includes/head-custom.html file -->
|
25 |
<link rel="stylesheet" href="assets/css/custom_style.css?v=90447f115a006bc45b738d9592069468b20e2551">
|
26 |
+
<link rel="stylesheet" href="style.css">
|
27 |
+
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
|
28 |
<link rel="stylesheet" href="https://ajax.googleapis.com/ajax/libs/jqueryui/1.12.1/themes/smoothness/jquery-ui.css">
|
29 |
<script src="https://ajax.googleapis.com/ajax/libs/jqueryui/1.12.1/jquery-ui.min.js"></script>
|
30 |
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/2.9.4/Chart.js"></script>
|
|
|
32 |
<link rel="stylesheet" href="//code.jquery.com/ui/1.13.2/themes/base/jquery-ui.css">
|
33 |
<link rel="stylesheet" href="/resources/demos/style.css">
|
34 |
<script src="https://code.jquery.com/jquery-3.6.0.js"></script>
|
35 |
+
<script src="https://code.jquery.com/ui/1.13.2/jquery-ui.js"></script>
|
36 |
|
37 |
<!-- for mathjax support -->
|
38 |
<script src="https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6"></script>
|
39 |
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
|
40 |
<!-- end custom head snippets -->
|
41 |
|
42 |
+
<!-- Font Awesome for PDF and GitHub icons -->
|
43 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
|
45 |
+
<!-- Academicons (for the arXiv icon) -->
|
46 |
+
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/academicons/1.9.1/css/academicons.min.css">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
+
<script>
|
49 |
+
let normalIndex = 0;
|
50 |
+
let attackIndex = 0;
|
51 |
+
|
52 |
+
function navigateImages(type, direction) {
|
53 |
+
let images;
|
54 |
+
let currentIndex;
|
55 |
+
|
56 |
+
// Determine which set of images to target and the current index
|
57 |
+
if (type === 'normal') {
|
58 |
+
images = document.querySelectorAll('.normal-gallery .image-gallery img');
|
59 |
+
currentIndex = normalIndex;
|
60 |
+
} else if (type === 'attack') {
|
61 |
+
images = document.querySelectorAll('.attack-gallery .image-gallery img');
|
62 |
+
currentIndex = attackIndex;
|
63 |
+
}
|
64 |
+
|
65 |
+
// Check if images were found to avoid accessing undefined elements
|
66 |
+
if (images && images.length > 0) {
|
67 |
+
// Hide current image
|
68 |
+
images[currentIndex].classList.remove('active');
|
69 |
+
|
70 |
+
// Calculate new index (modulo ensures cycling)
|
71 |
+
currentIndex = (currentIndex + direction + images.length) % images.length;
|
72 |
+
|
73 |
+
// Show new image
|
74 |
+
images[currentIndex].classList.add('active');
|
75 |
+
|
76 |
+
// Update index tracker
|
77 |
+
if (type === 'normal') {
|
78 |
+
normalIndex = currentIndex;
|
79 |
+
} else if (type === 'attack') {
|
80 |
+
attackIndex = currentIndex;
|
81 |
+
}
|
82 |
+
} else {
|
83 |
+
console.error("No images found for type:", type);
|
84 |
+
}
|
85 |
}
|
86 |
+
</script>
|
87 |
+
|
88 |
</head>
|
89 |
<body>
|
90 |
<header class="page-header" role="banner">
|
91 |
+
<h1 class="project-name" style="font-weight: 500;">Attention Tracker</h1>
|
92 |
<h2 class="project-tagline">Attention Tracker: Detecting Prompt Injection Attacks in LLMs</h2>
|
93 |
+
<p/>
|
94 |
+
<div style="text-align: center; font-size:larger; ">
|
95 |
<div>
|
96 |
<a href="https://khhung906.github.io/" style="color: white;" target="_blank" rel="noopener noreferrer">
|
97 |
Kuo-Han Hung<sup>1</sup>,
|
|
|
115 |
</a>
|
116 |
</div>
|
117 |
|
118 |
+
<div style="color: #f1f0f0">
|
119 |
<sup>1</sup>National Taiwan University <sup>2</sup>IBM Research
|
120 |
</div>
|
121 |
+
|
122 |
+
<div class="publication-links">
|
123 |
+
<span class="link-block">
|
124 |
+
<a href="https://arxiv.org/pdf/<ARXIV PAPER ID>.pdf" target="_blank"
|
125 |
+
class="external-link button is-normal is-rounded is-dark">
|
126 |
+
<span class="icon">
|
127 |
+
<i class="fas fa-file-pdf"></i>
|
128 |
+
</span>
|
129 |
+
<span>Paper</span>
|
130 |
+
</a>
|
131 |
+
</span>
|
132 |
+
|
133 |
+
<span class="link-block">
|
134 |
+
<a href="https://arxiv.org/abs/<ARXIV PAPER ID>" target="_blank"
|
135 |
+
class="external-link button is-normal is-rounded is-dark">
|
136 |
+
<span class="icon">
|
137 |
+
<i class="ai ai-arxiv"></i>
|
138 |
+
</span>
|
139 |
+
<span>arXiv</span>
|
140 |
+
</a>
|
141 |
+
</span>
|
142 |
+
|
143 |
+
<span class="link-block">
|
144 |
+
<a href="https://github.com/YOUR REPO HERE" target="_blank"
|
145 |
+
class="external-link button is-normal is-rounded is-dark">
|
146 |
+
<span class="icon">
|
147 |
+
<i class="fab fa-github"></i>
|
148 |
+
</span>
|
149 |
+
<span>Code</span>
|
150 |
+
</a>
|
151 |
+
</span>
|
152 |
+
|
153 |
+
<span class="link-block">
|
154 |
+
<a href="https://dataset-link.com" target="_blank"
|
155 |
+
class="external-link button is-normal is-rounded is-dark">
|
156 |
+
<span class="icon">
|
157 |
+
<i class="fas fa-laptop"></i>
|
158 |
+
</span>
|
159 |
+
<span>Demo</span>
|
160 |
+
</a>
|
161 |
+
</span>
|
162 |
+
</div>
|
163 |
+
|
164 |
</header>
|
165 |
+
|
166 |
+
|
167 |
<main id="content" class="main-content" role="main">
|
168 |
+
<h2 id="abstract" class="section-title">Abstract</h2>
|
169 |
|
170 |
<p>Large Language Models (LLMs) have revolutionized various domains but remain vulnerable to prompt injection attacks, where malicious inputs manipulate the model into ignoring original instructions and executing designated action. In this paper, we investigate
|
171 |
the underlying mechanisms of these attacks by analyzing the attention patterns within LLMs.
|
|
|
177 |
demonstrate the robustness of our approach through extensive evaluations and provide insights into safeguarding LLM-integrated systems from prompt injection vulnerabilities.
|
178 |
</p>
|
179 |
|
180 |
+
<h2 id="what-is-jailbreak" class="section-title">What is Prompt Injection Attack?</h2>
|
181 |
<p>A Prompt Injection Attack is a technique used to manipulate language models (like GPT-3 or similar AI systems) by injecting malicious or deceptive prompts into the input data, causing the model to behave in unexpected or undesired ways. This attack exploits the way language models interpret and respond to instructions, tricking them into providing information or performing actions that were not originally intended.</p>
|
182 |
|
183 |
|
184 |
+
<h2 id="refusal-loss" class="section-title">Distraction Effect</h2>
|
185 |
|
186 |
<p>
|
187 |
In this section, we analyze the reasons behind the success of prompt injection attacks on LLMs. Specifically, we aim to understand
|
|
|
201 |
|
202 |
</div>
|
203 |
|
204 |
+
<h2 id="proposed-approach-attention-tracker" class="section-title">Proposed Approach: Attention Tracker</h2>
|
205 |
<p> With the discovery of the distraction effect, we propose <strong>Attention Tracker</strong>,
|
206 |
a prompt injection detection method based on tracking the attention pattern on instruction. Our detection procedure is shown below:
|
207 |
</p>
|
|
|
223 |
We provide more details about the running flow of Attention Tracker in the paper.
|
224 |
</p>
|
225 |
|
226 |
+
<h2 id="result-attention-tracker" class="section-title">Experiment Result</h2>
|
227 |
<p>
|
228 |
In this section, we evaluate Attention Tracker against various baselines with the AUROC score on two prompt injection detection benchmarks: Open-Prompt-Injection and deepset prompt injection dataset:
|
229 |
</p>
|
|
|
232 |
<p>
|
233 |
As shown in the table, Attention Tracker consistently outperforms existing baselines, with an AUROC improvement of up to 3.1% on the Open-Prompt-Injection benchmark and 10.0% on the deepset prompt injection dataset. Among training-free methods, it achieves even greater gains, with an average AUROC improvement of 31.3% and 20.9% across the two datasets, respectively. Unlike LLM-based methods that rely on larger models for stability, Attention Tracker delivers robust and effective performance even with smaller LLMs, underscoring its suitability for real-world applications.
|
234 |
</p>
|
235 |
+
<h2 id="demonstration" class="section-title">Demo</h2>
|
236 |
|
237 |
<p>
|
238 |
We evaluated the effectiveness of the Attention Tracker by visualizing the distribution of attention aggregation for key heads across different data types (normal data vs. attack data) in the Open-Prompt-Injection dataset. Additionally, we calculated the focus score for these data samples. A higher focus score indicates a lower likelihood of prompt injection attacks. The tested model is Qwen-2 1.8b.
|
|
|
246 |
<img id="normalImage2" src="./demo_results/normal_2.png" alt="Normal Image 2">
|
247 |
<img id="normalImage3" src="./demo_results/normal_3.png" alt="Normal Image 3">
|
248 |
<img id="normalImage4" src="./demo_results/normal_4.png" alt="Normal Image 4">
|
|
|
249 |
</div>
|
250 |
<span class="arrow right-arrow" onclick="navigateImages('normal', 1)">></span>
|
251 |
</div>
|
|
|
262 |
<span class="arrow right-arrow" onclick="navigateImages('attack', 1)">></span>
|
263 |
</div>
|
264 |
|
265 |
+
<!-- <h2 id="inquiries" class="section-title"> Inquiries on Attention Tracker</h2>
|
266 |
+
<p class="section-title"> Please contact <a href="Mailto:khhung906@gmail.com">Kuo-Han Hung</a>
|
267 |
and <a href="Mailto:pin-yu.chen@ibm.com">Pin-Yu Chen</a>
|
268 |
+
</p> -->
|
269 |
|
270 |
+
<h2 id="citations" class="section-title">Citations</h2>
|
271 |
<p>If you find Attention Tracker helpful and useful for your research, please cite our main paper as follows:</p>
|
272 |
|
273 |
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>
|
style.css
CHANGED
@@ -1,86 +1,112 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
4 |
}
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
|
|
9 |
}
|
10 |
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
}
|
17 |
|
18 |
-
.
|
19 |
-
|
20 |
-
margin: 0 auto;
|
21 |
-
padding: 16px;
|
22 |
-
border: 1px solid lightgray;
|
23 |
-
border-radius: 16px;
|
24 |
}
|
25 |
|
26 |
-
.
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
}
|
29 |
|
30 |
-
.
|
31 |
-
|
32 |
-
justify-content: center;
|
33 |
-
gap: 20px; /* Space between images */
|
34 |
-
margin-top: 20px; /* Add space above the image gallery */
|
35 |
}
|
36 |
|
37 |
-
.
|
38 |
-
|
39 |
-
max-width: 500px; /* Ensure maximum width remains 500px */
|
40 |
-
height: auto; /* Maintain aspect ratio */
|
41 |
-
border-radius: 10px; /* Rounded corners for images */
|
42 |
-
box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1); /* Add shadow to images */
|
43 |
-
display: none; /* Initially hide all images */
|
44 |
}
|
45 |
|
46 |
-
.
|
47 |
-
|
48 |
-
margin-top: 20px;
|
49 |
}
|
50 |
|
51 |
-
.
|
|
|
52 |
font-weight: bold;
|
53 |
-
margin:
|
54 |
font-size: 1.2em;
|
|
|
|
|
|
|
55 |
}
|
56 |
|
57 |
-
.
|
58 |
-
|
59 |
}
|
60 |
|
61 |
-
.
|
62 |
-
|
63 |
}
|
64 |
|
65 |
-
.
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
}
|
75 |
|
76 |
-
.button
|
77 |
-
|
|
|
78 |
}
|
79 |
|
80 |
-
.
|
81 |
-
|
|
|
82 |
}
|
83 |
|
84 |
-
|
85 |
-
|
|
|
86 |
}
|
|
|
1 |
+
.image-gallery-container {
|
2 |
+
position: relative;
|
3 |
+
display: flex;
|
4 |
+
justify-content: center;
|
5 |
+
align-items: center;
|
6 |
+
margin: 20px auto;
|
7 |
}
|
8 |
|
9 |
+
.image-gallery {
|
10 |
+
width: 80%; /* Increased width */
|
11 |
+
max-height: 600px; /* Increased height */
|
12 |
+
overflow: hidden;
|
13 |
+
position: relative;
|
14 |
+
margin-left: 50px;
|
15 |
+
margin-right: 50px;
|
16 |
}
|
17 |
|
18 |
+
.image-gallery img {
|
19 |
+
width: 100%;
|
20 |
+
height: auto;
|
21 |
+
border-radius: 10px;
|
22 |
+
display: none;
|
23 |
}
|
24 |
|
25 |
+
.image-gallery img.active {
|
26 |
+
display: block;
|
|
|
|
|
|
|
|
|
27 |
}
|
28 |
|
29 |
+
.arrow {
|
30 |
+
cursor: pointer;
|
31 |
+
position: absolute;
|
32 |
+
top: 50%;
|
33 |
+
transform: translateY(-50%);
|
34 |
+
font-size: 2.5em;
|
35 |
+
color: #444; /* Darker color */
|
36 |
+
padding: 5px 15px;
|
37 |
+
transition: color 0.3s;
|
38 |
+
user-select: none;
|
39 |
+
transform: scaleX(0.7);
|
40 |
}
|
41 |
|
42 |
+
.arrow:hover {
|
43 |
+
color: #007bff; /* Color on hover */
|
|
|
|
|
|
|
44 |
}
|
45 |
|
46 |
+
.left-arrow {
|
47 |
+
left: 20px; /* Adjusted position */
|
|
|
|
|
|
|
|
|
|
|
48 |
}
|
49 |
|
50 |
+
.right-arrow {
|
51 |
+
right: 20px; /* Adjusted position */
|
|
|
52 |
}
|
53 |
|
54 |
+
.group-title {
|
55 |
+
font-size: large;
|
56 |
font-weight: bold;
|
57 |
+
margin-top: 30px;
|
58 |
font-size: 1.2em;
|
59 |
+
text-align: center; /* Center the text */
|
60 |
+
display: block; /* Ensure it behaves as a block element */
|
61 |
+
margin-bottom: 10px; /* Add some space below the title */
|
62 |
}
|
63 |
|
64 |
+
.green {
|
65 |
+
color: green; /* Set text color to green */
|
66 |
}
|
67 |
|
68 |
+
.red {
|
69 |
+
color: red; /* Set text color to red */
|
70 |
}
|
71 |
|
72 |
+
.section-title {
|
73 |
+
text-align: center;
|
74 |
+
}
|
75 |
+
|
76 |
+
|
77 |
+
.publication-links {
|
78 |
+
text-align: center;
|
79 |
+
margin-top: 20px;
|
80 |
+
}
|
81 |
+
|
82 |
+
.link-block {
|
83 |
+
display: inline-block;
|
84 |
+
margin: 8px; /* Increase spacing between buttons */
|
85 |
+
}
|
86 |
+
|
87 |
+
.button {
|
88 |
+
display: inline-flex;
|
89 |
+
align-items: center;
|
90 |
+
padding: 10px 18px; /* Larger padding for bigger buttons */
|
91 |
+
text-decoration: none;
|
92 |
+
color: black; /* Fixed text color */
|
93 |
+
background-color: rgb(236, 236, 236); /* Light gray background color */
|
94 |
+
border-radius: 25px; /* Rounded corners */
|
95 |
+
font-size: 18px; /* Larger font size */
|
96 |
+
transition: background-color 0.3s ease;
|
97 |
}
|
98 |
|
99 |
+
.button:hover {
|
100 |
+
background-color: #495057; /* Darker gray on hover */
|
101 |
+
color: white; /* Switch text to white on hover for contrast against the darker background */
|
102 |
}
|
103 |
|
104 |
+
.icon {
|
105 |
+
margin-right: 8px; /* Adjust spacing between icon and text */
|
106 |
+
font-size: 20px; /* Larger icon size */
|
107 |
}
|
108 |
|
109 |
+
/* Adjust font size for icons */
|
110 |
+
.fas, .fab, .ai {
|
111 |
+
font-size: 20px; /* Ensuring all icons match the new size */
|
112 |
}
|