Spaces:
Running
Running
johnsonhung906
committed on
Commit
•
a8cdd89
1
Parent(s):
88e1260
add intro image & demo image changes
Browse files
- .DS_Store +0 -0
- assets/.DS_Store +0 -0
- demo_results/attack_5.png +0 -0
- demo_results/normal_5.png +0 -0
- figures/attack_intro.png +0 -0
- index.html +62 -48
- style.css +8 -0
.DS_Store
DELETED
Binary file (6.15 kB)
|
|
assets/.DS_Store
CHANGED
Binary files a/assets/.DS_Store and b/assets/.DS_Store differ
|
|
demo_results/attack_5.png
ADDED
demo_results/normal_5.png
ADDED
figures/attack_intro.png
ADDED
index.html
CHANGED
@@ -48,41 +48,53 @@
|
|
48 |
<script>
|
49 |
let normalIndex = 0;
|
50 |
let attackIndex = 0;
|
51 |
-
|
52 |
function navigateImages(type, direction) {
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
if (type === 'normal') {
|
58 |
-
|
59 |
-
currentIndex = normalIndex;
|
60 |
} else if (type === 'attack') {
|
61 |
-
|
62 |
-
currentIndex = attackIndex;
|
63 |
-
}
|
64 |
-
|
65 |
-
// Check if images were found to avoid accessing undefined elements
|
66 |
-
if (images && images.length > 0) {
|
67 |
-
// Hide current image
|
68 |
-
images[currentIndex].classList.remove('active');
|
69 |
-
|
70 |
-
// Calculate new index (modulo ensures cycling)
|
71 |
-
currentIndex = (currentIndex + direction + images.length) % images.length;
|
72 |
-
|
73 |
-
// Show new image
|
74 |
-
images[currentIndex].classList.add('active');
|
75 |
-
|
76 |
-
// Update index tracker
|
77 |
-
if (type === 'normal') {
|
78 |
-
normalIndex = currentIndex;
|
79 |
-
} else if (type === 'attack') {
|
80 |
-
attackIndex = currentIndex;
|
81 |
-
}
|
82 |
-
} else {
|
83 |
-
console.error("No images found for type:", type);
|
84 |
}
|
|
|
|
|
|
|
85 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
</script>
|
87 |
|
88 |
</head>
|
@@ -151,14 +163,13 @@
|
|
151 |
</span>
|
152 |
|
153 |
<span class="link-block">
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
</span>
|
162 |
</div>
|
163 |
|
164 |
</header>
|
@@ -180,6 +191,7 @@
|
|
180 |
<h2 id="what-is-jailbreak" class="section-title">What is Prompt Injection Attack?</h2>
|
181 |
<p>A Prompt Injection Attack is a technique used to manipulate language models (like GPT-3 or similar AI systems) by injecting malicious or deceptive prompts into the input data, causing the model to behave in unexpected or undesired ways. This attack exploits the way language models interpret and respond to instructions, tricking them into providing information or performing actions that were not originally intended.</p>
|
182 |
|
|
|
183 |
|
184 |
<h2 id="refusal-loss" class="section-title">Distraction Effect</h2>
|
185 |
|
@@ -232,7 +244,7 @@
|
|
232 |
<p>
|
233 |
As shown in the table, Attention Tracker consistently outperforms existing baselines, with an AUROC improvement of up to 3.1% on the Open-Prompt-Injection benchmark and 10.0% on the deepset prompt injection dataset. Among training-free methods, it achieves even greater gains, with an average AUROC improvement of 31.3% and 20.9% across the two datasets, respectively. Unlike LLM-based methods that rely on larger models for stability, Attention Tracker delivers robust and effective performance even with smaller LLMs, underscoring its suitability for real-world applications.
|
234 |
</p>
|
235 |
-
<h2 id="
|
236 |
|
237 |
<p>
|
238 |
We evaluated the effectiveness of the Attention Tracker by visualizing the distribution of attention aggregation for key heads across different data types (normal data vs. attack data) in the Open-Prompt-Injection dataset. Additionally, we calculated the focus score for these data samples. A higher focus score indicates a lower likelihood of prompt injection attacks. The tested model is Qwen-2 1.8b.
|
@@ -240,24 +252,26 @@
|
|
240 |
<div class="group-title green">Normal Data</div>
|
241 |
|
242 |
<div class="image-gallery-container normal-gallery">
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
|
|
251 |
</div>
|
252 |
|
253 |
<div class="group-title red">Attack Data</div>
|
254 |
<div class="image-gallery-container attack-gallery">
|
255 |
<span class="arrow left-arrow" onclick="navigateImages('attack', -1)"><</span>
|
256 |
<div class="image-gallery">
|
257 |
-
<img id="attackImage1" src="./demo_results/attack_1.png"
|
258 |
<img id="attackImage2" src="./demo_results/attack_2.png" alt="Attack Image 2">
|
259 |
<img id="attackImage3" src="./demo_results/attack_3.png" alt="Attack Image 3">
|
260 |
<img id="attackImage4" src="./demo_results/attack_4.png" alt="Attack Image 4">
|
|
|
261 |
</div>
|
262 |
<span class="arrow right-arrow" onclick="navigateImages('attack', 1)">></span>
|
263 |
</div>
|
|
|
48 |
<script>
|
49 |
let normalIndex = 0;
|
50 |
let attackIndex = 0;
|
51 |
+
|
52 |
function navigateImages(type, direction) {
|
53 |
+
let images;
|
54 |
+
let currentIndex;
|
55 |
+
|
56 |
+
if (type === 'normal') {
|
57 |
+
images = document.querySelectorAll('.normal-gallery .image-gallery img');
|
58 |
+
currentIndex = normalIndex;
|
59 |
+
} else if (type === 'attack') {
|
60 |
+
images = document.querySelectorAll('.attack-gallery .image-gallery img');
|
61 |
+
currentIndex = attackIndex;
|
62 |
+
}
|
63 |
+
|
64 |
+
if (images && images.length > 0) {
|
65 |
+
// Remove the active class from the current image
|
66 |
+
images[currentIndex].classList.remove('active');
|
67 |
+
|
68 |
+
// Update the current index based on direction and number of images
|
69 |
+
currentIndex = (currentIndex + direction + images.length) % images.length;
|
70 |
+
|
71 |
+
// Add the active class to the new image
|
72 |
+
images[currentIndex].classList.add('active');
|
73 |
+
|
74 |
+
// Save the updated index
|
75 |
if (type === 'normal') {
|
76 |
+
normalIndex = currentIndex;
|
|
|
77 |
} else if (type === 'attack') {
|
78 |
+
attackIndex = currentIndex;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
}
|
80 |
+
} else {
|
81 |
+
console.error("No images found for type:", type);
|
82 |
+
}
|
83 |
}
|
84 |
+
|
85 |
+
// Initialize the galleries by adding the active class to the first image
|
86 |
+
document.addEventListener("DOMContentLoaded", () => {
|
87 |
+
const normalImages = document.querySelectorAll('.normal-gallery .image-gallery img');
|
88 |
+
const attackImages = document.querySelectorAll('.attack-gallery .image-gallery img');
|
89 |
+
|
90 |
+
if (normalImages.length > 0) {
|
91 |
+
normalImages[0].classList.add('active');
|
92 |
+
}
|
93 |
+
|
94 |
+
if (attackImages.length > 0) {
|
95 |
+
attackImages[0].classList.add('active');
|
96 |
+
}
|
97 |
+
});
|
98 |
</script>
|
99 |
|
100 |
</head>
|
|
|
163 |
</span>
|
164 |
|
165 |
<span class="link-block">
|
166 |
+
<a href="#demo" class="external-link button is-normal is-rounded is-dark">
|
167 |
+
<span class="icon">
|
168 |
+
<i class="fas fa-laptop"></i>
|
169 |
+
</span>
|
170 |
+
<span>Demo</span>
|
171 |
+
</a>
|
172 |
+
</span>
|
|
|
173 |
</div>
|
174 |
|
175 |
</header>
|
|
|
191 |
<h2 id="what-is-jailbreak" class="section-title">What is Prompt Injection Attack?</h2>
|
192 |
<p>A Prompt Injection Attack is a technique used to manipulate language models (like GPT-3 or similar AI systems) by injecting malicious or deceptive prompts into the input data, causing the model to behave in unexpected or undesired ways. This attack exploits the way language models interpret and respond to instructions, tricking them into providing information or performing actions that were not originally intended.</p>
|
193 |
|
194 |
+
<div><img id="attack-intro" src="./figures/attack_intro.png" /></div>
|
195 |
|
196 |
<h2 id="refusal-loss" class="section-title">Distraction Effect</h2>
|
197 |
|
|
|
244 |
<p>
|
245 |
As shown in the table, Attention Tracker consistently outperforms existing baselines, with an AUROC improvement of up to 3.1% on the Open-Prompt-Injection benchmark and 10.0% on the deepset prompt injection dataset. Among training-free methods, it achieves even greater gains, with an average AUROC improvement of 31.3% and 20.9% across the two datasets, respectively. Unlike LLM-based methods that rely on larger models for stability, Attention Tracker delivers robust and effective performance even with smaller LLMs, underscoring its suitability for real-world applications.
|
246 |
</p>
|
247 |
+
<h2 id="demo" class="section-title">Demo</h2>
|
248 |
|
249 |
<p>
|
250 |
We evaluated the effectiveness of the Attention Tracker by visualizing the distribution of attention aggregation for key heads across different data types (normal data vs. attack data) in the Open-Prompt-Injection dataset. Additionally, we calculated the focus score for these data samples. A higher focus score indicates a lower likelihood of prompt injection attacks. The tested model is Qwen-2 1.8b.
|
|
|
252 |
<div class="group-title green">Normal Data</div>
|
253 |
|
254 |
<div class="image-gallery-container normal-gallery">
|
255 |
+
<span class="arrow left-arrow" onclick="navigateImages('normal', -1)"><</span>
|
256 |
+
<div class="image-gallery">
|
257 |
+
<!-- <img id="normalImage1" src="./demo_results/normal_1.png" alt="Normal Image 1"> -->
|
258 |
+
<img id="normalImage2" src="./demo_results/normal_2.png" alt="Normal Image 2">
|
259 |
+
<img id="normalImage3" src="./demo_results/normal_3.png" alt="Normal Image 3">
|
260 |
+
<img id="normalImage4" src="./demo_results/normal_4.png" alt="Normal Image 4">
|
261 |
+
<img id="normalImage5" src="./demo_results/normal_5.png" alt="Normal Image 5">
|
262 |
+
</div>
|
263 |
+
<span class="arrow right-arrow" onclick="navigateImages('normal', 1)">></span>
|
264 |
</div>
|
265 |
|
266 |
<div class="group-title red">Attack Data</div>
|
267 |
<div class="image-gallery-container attack-gallery">
|
268 |
<span class="arrow left-arrow" onclick="navigateImages('attack', -1)"><</span>
|
269 |
<div class="image-gallery">
|
270 |
+
<!-- <img id="attackImage1" src="./demo_results/attack_1.png" alt="Attack Image 1" class="active"> -->
|
271 |
<img id="attackImage2" src="./demo_results/attack_2.png" alt="Attack Image 2">
|
272 |
<img id="attackImage3" src="./demo_results/attack_3.png" alt="Attack Image 3">
|
273 |
<img id="attackImage4" src="./demo_results/attack_4.png" alt="Attack Image 4">
|
274 |
+
<img id="attackImage5" src="./demo_results/attack_5.png" alt="Attack Image 5">
|
275 |
</div>
|
276 |
<span class="arrow right-arrow" onclick="navigateImages('attack', 1)">></span>
|
277 |
</div>
|
style.css
CHANGED
@@ -109,4 +109,12 @@
|
|
109 |
/* Adjust font size for icons */
|
110 |
.fas, .fab, .ai {
|
111 |
font-size: 20px; /* Ensuring all icons match the new size */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
112 |
}
|
|
|
109 |
/* Adjust font size for icons */
|
110 |
.fas, .fab, .ai {
|
111 |
font-size: 20px; /* Ensuring all icons match the new size */
|
112 |
+
}
|
113 |
+
|
114 |
+
.image-gallery img {
|
115 |
+
display: none; /* Hide all images by default */
|
116 |
+
}
|
117 |
+
|
118 |
+
.image-gallery img.active {
|
119 |
+
display: block; /* Show only the image with the active class */
|
120 |
}
|