Spaces:
Running
Running
<html lang="en-US"> | |
<head> | |
<meta charset="UTF-8"> | |
<!-- Begin Jekyll SEO tag v2.8.0 --> | |
<title>Attention Tracker | Attention Tracker: Detecting Prompt Injection Attacks in LLMs </title> | |
<meta property="og:title" content="Gradient Cuff" /> | |
<meta property="og:locale" content="en_US" /> | |
<meta name="description" content="Detecting Prompt Injection Attacks in LLMs using attention" /> | |
<meta property="og:description" content="Detecting Prompt Injection Attacks in LLMs using attention" /> | |
<script type="application/ld+json"> | |
{"@context":"https://schema.org","@type":"WebSite","description":"Detecting Jailbreak Attacks on Large Language Models by Exploring Refusal Loss Landscapes","headline":"Gradient Cuff","name":"Gradient Cuff","url":"https://huggingface.co/spaces/gregH/Gradient Cuff"}</script> | |
<!-- End Jekyll SEO tag --> | |
<!-- <link rel="preconnect" href="https://fonts.gstatic.com"> | |
<link rel="preload" href="https://fonts.googleapis.com/css?family=Open+Sans:400,700&display=swap" as="style" type="text/css" crossorigin> | |
<meta name="viewport" content="width=device-width, initial-scale=1"> | |
<meta name="theme-color" content="#157878"> | |
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent"> --> | |
<link rel="stylesheet" href="assets/css/bootstrap/bootstrap.min.css?v=90447f115a006bc45b738d9592069468b20e2551"> | |
<link rel="stylesheet" href="assets/css/style.css?v=90447f115a006bc45b738d9592069468b20e2551"> | |
<!-- start custom head snippets, customize with your own _includes/head-custom.html file --> | |
<link rel="stylesheet" href="assets/css/custom_style.css?v=90447f115a006bc45b738d9592069468b20e2551"> | |
<link rel="stylesheet" href="style.css"> | |
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script> | |
<link rel="stylesheet" href="https://ajax.googleapis.com/ajax/libs/jqueryui/1.12.1/themes/smoothness/jquery-ui.css"> | |
<script src="https://ajax.googleapis.com/ajax/libs/jqueryui/1.12.1/jquery-ui.min.js"></script> | |
<script src="https://cdnjs.cloudflare.com/ajax/libs/Chart.js/2.9.4/Chart.js"></script> | |
<script src="assets/js/calibration.js?v=90447f115a006bc45b738d9592069468b20e2551"></script> | |
<link rel="stylesheet" href="//code.jquery.com/ui/1.13.2/themes/base/jquery-ui.css"> | |
<link rel="stylesheet" href="/resources/demos/style.css"> | |
<script src="https://code.jquery.com/jquery-3.6.0.js"></script> | |
<script src="https://code.jquery.com/ui/1.13.2/jquery-ui.js"></script> | |
<!-- for mathjax support --> | |
<script src="https://cdnjs.cloudflare.com/polyfill/v3/polyfill.min.js?features=es6"></script> | |
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script> | |
<!-- end custom head snippets --> | |
<!-- Font Awesome for PDF and GitHub icons --> | |
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css"> | |
<!-- AI2 HTML-CSS Icons (for arXiv) --> | |
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/academicons/1.9.1/css/academicons.min.css"> | |
<script> | |
let normalIndex = 0; | |
let attackIndex = 0; | |
function navigateImages(type, direction) { | |
let images; | |
let currentIndex; | |
if (type === 'normal') { | |
images = document.querySelectorAll('.normal-gallery .image-gallery img'); | |
currentIndex = normalIndex; | |
} else if (type === 'attack') { | |
images = document.querySelectorAll('.attack-gallery .image-gallery img'); | |
currentIndex = attackIndex; | |
} | |
if (images && images.length > 0) { | |
// Remove the active class from the current image | |
images[currentIndex].classList.remove('active'); | |
// Update the current index based on direction and number of images | |
currentIndex = (currentIndex + direction + images.length) % images.length; | |
// Add the active class to the new image | |
images[currentIndex].classList.add('active'); | |
// Save the updated index | |
if (type === 'normal') { | |
normalIndex = currentIndex; | |
} else if (type === 'attack') { | |
attackIndex = currentIndex; | |
} | |
} else { | |
console.error("No images found for type:", type); | |
} | |
} | |
// Initialize the galleries by adding the active class to the first image | |
document.addEventListener("DOMContentLoaded", () => { | |
const normalImages = document.querySelectorAll('.normal-gallery .image-gallery img'); | |
const attackImages = document.querySelectorAll('.attack-gallery .image-gallery img'); | |
if (normalImages.length > 0) { | |
normalImages[0].classList.add('active'); | |
} | |
if (attackImages.length > 0) { | |
attackImages[0].classList.add('active'); | |
} | |
}); | |
</script> | |
</head> | |
<body> | |
<header class="page-header" role="banner"> | |
<h1 class="project-name" style="font-weight: 500;">Attention Tracker</h1> | |
<h2 class="project-tagline">Attention Tracker: Detecting Prompt Injection Attacks in LLMs</h2> | |
<p /> | |
<div style="text-align: center; font-size:larger; "> | |
<div> | |
<a href="https://khhung906.github.io/" style="color: white;" target="_blank" rel="noopener noreferrer"> | |
Kuo-Han Hung<sup>1,2</sup>, | |
</a> | |
<a href="https://ireneko.github.io/" style="color: white;" target="_blank" rel="noopener noreferrer"> | |
Ching-Yun Ko<sup>1</sup>, | |
</a> | |
<a href="" style="color: white;" target="_blank" rel="noopener noreferrer"> | |
Ambrish Rawat<sup>1</sup>, | |
</a> | |
</div> | |
<div> | |
<a href="" style="color: white;" target="_blank" rel="noopener noreferrer"> | |
I-Hsin Chung<sup>1</sup>, | |
</a> | |
<a href="https://winstonhsu.info/" style="color: white;" target="_blank" rel="noopener noreferrer"> | |
Winston H. Hsu<sup>2</sup>, | |
</a> | |
<a href="https://sites.google.com/site/pinyuchenpage/" style="color: white;" target="_blank" | |
rel="noopener noreferrer"> | |
Pin-Yu Chen<sup>1</sup> | |
</a> | |
</div> | |
<div style="color: #f1f0f0"> | |
<sup>1</sup>IBM Research <sup>2</sup>National Taiwan University | |
</div> | |
<div class="publication-links"> | |
<span class="link-block"> | |
<a href="https://arxiv.org/pdf/2411.00348.pdf" target="_blank" | |
class="external-link button is-normal is-rounded is-dark"> | |
<span class="icon"> | |
<i class="fas fa-file-pdf"></i> | |
</span> | |
<span>Paper</span> | |
</a> | |
</span> | |
<span class="link-block"> | |
<a href="https://arxiv.org/abs/2411.00348" target="_blank" | |
class="external-link button is-normal is-rounded is-dark"> | |
<span class="icon"> | |
<i class="ai ai-arxiv"></i> | |
</span> | |
<span>arXiv</span> | |
</a> | |
</span> | |
<span class="link-block"> | |
<a href="https://github.com/YOUR REPO HERE" target="_blank" | |
class="external-link button is-normal is-rounded is-dark"> | |
<span class="icon"> | |
<i class="fab fa-github"></i> | |
</span> | |
<span>Code</span> | |
</a> | |
</span> | |
<span class="link-block"> | |
<a href="https://huggingface.co/spaces/pinyuchen/attention-tracker" target="_blank" | |
class="external-link button is-normal is-rounded is-dark"> | |
<span class="icon"> | |
<i class="fas fa-laptop"></i> | |
</span> | |
<span>Demo</span> | |
</a> | |
</span> | |
</div> | |
</header> | |
<main id="content" class="main-content" role="main"> | |
<h2 id="abstract" class="section-title">Abstract</h2> | |
<p>Large Language Models (LLMs) have revolutionized various domains but remain vulnerable to prompt injection | |
attacks, where malicious inputs manipulate the model into ignoring original instructions and executing designated | |
action. In this paper, we investigate | |
the underlying mechanisms of these attacks by analyzing the attention patterns within LLMs. | |
We introduce the concept of the <strong>distraction effect</strong>, where specific attention heads, termed | |
important heads, shift focus from the original instruction to the injected instruction. Building on this | |
discovery, we propose <strong>Attention | |
Tracker</strong>, a training-free detection method that tracks attention patterns on instruction to detect | |
prompt injection attacks without the need for additional LLM inference. Our method generalizes effectively across | |
diverse models, datasets, | |
and attack types, showing an AUROC improvement of up to 10.0% over existing methods, and performs well even on | |
small LLMs. We | |
demonstrate the robustness of our approach through extensive evaluations and provide insights into safeguarding | |
LLM-integrated systems from prompt injection vulnerabilities. | |
</p> | |
<h2 id="what-is-jailbreak" class="section-title">What is Prompt Injection Attack?</h2> | |
<p>A Prompt Injection Attack is a technique used to manipulate language models (like GPT-3 or similar AI systems) by | |
injecting malicious or deceptive prompts into the input data, causing the model to behave in unexpected or | |
undesired ways. This attack exploits the way language models interpret and respond to instructions, tricking them | |
into providing information or performing actions that were not originally intended.</p> | |
<div><img id="attack-intro" src="./figures/attack_intro.png" /></div> | |
<h2 id="refusal-loss" class="section-title">Distraction Effect</h2> | |
<p> | |
In this section, we analyze the reasons behind the success of prompt injection attacks on LLMs. Specifically, we | |
aim to understand | |
<strong>what mechanism within LLMs causes them to "ignore" the original instruction and follow the injected | |
instruction instead</strong>. | |
To explore this, we examine the attention patterns of the last token in the input prompts, as it has the most | |
direct influence on the LLMs' output. | |
</p> | |
<div class="container"> | |
<div><img id="attn-map-img" src="./figures/attn_map.png" /></div> | |
</div> | |
<p> | |
In the figure (a), we visualize the attention maps of the last token in the input prompt for normal and attack | |
data. We observe that the attention maps for normal data are much darker than those for attacked data, | |
particularly in the middle and earlier layers of the LLM. This indicates that the last token's attention to the | |
instruction is significantly higher for normal data than for attack data in specific attention heads. When | |
inputting attacked data, the attention shifts away from the original instruction towards the attack data, which we | |
refer to as the <strong>distraction effect</strong>. | |
Additionally, in the figure (b), we find that the attention focus shifts from the original instruction to the | |
injected instruction in the attack data. This suggests that the separator string helps the attacker shift | |
attention to the injected instruction, causing the LLM to perform the injected task instead of the target task. | |
</p> | |
</div> | |
<h2 id="proposed-approach-attention-tracker" class="section-title">Proposed Approach: Attention Tracker</h2> | |
<p> With the discover of distraction effect, we propose <strong>Attention Tracker</strong>, | |
a prompt injection detection method based on tracking the attention pattern on instruction. Our detection | |
procedure is shown below: | |
</p> | |
<div class="container"><img id="attention-tracker-header" src="./figures/main.png" /></div> | |
<p></p> | |
<p> | |
Attention Tracker can be summarized into two phases: | |
</p> | |
<p> | |
<strong>(Phase 1) Finding Important Heads:</strong> In the first step, we identify specific attention head that | |
that exhibit the distraction effect, which we termed the important heads. To find the important heads, we use a | |
set of LLM-generated sentences with the ignore attack as the dataset. | |
</p> | |
<p> | |
<strong>(Phase 2) Prompt Injection Detection with Important Heads:</strong> In the second step, we feed the | |
testing quries into the target LLM and aggregate the attention directed towards the instruction in the important | |
heads. With this aggregated score which we call the <strong>focus score</strong>, we can effectively detect prompt | |
injection attacks. | |
</p> | |
<p> | |
We provide more details about the running flow of Attention Tracker in the paper. | |
</p> | |
<h2 id="result-attention-tracker" class="section-title">Experiment Result</h2> | |
<p> | |
In this section, we evaluate Attention Tracker against various baselines with the AUROC score on two prompt | |
injection detection benchmarks: Open-Prompt-Injection and deepset prompt injection dataset: | |
</p> | |
<div class="container"><img id="attention-tracker-header" src="./figures/result.png" /></div> | |
<p /> | |
<p> | |
As shown in the table, Attention Tracker consistently outperforms existing baselines, with an AUROC improvement of | |
up to 3.1% on the Open-Prompt-Injection benchmark and 10.0% on the deepset prompt injection dataset. Among | |
training-free methods, it achieves even greater gains, with an average AUROC improvement of 31.3% and 20.9% across | |
the two datasets, respectively. Unlike LLM-based methods that rely on larger models for stability, Attention | |
Tracker delivers robust and effective performance even with smaller LLMs, underscoring its suitability for | |
real-world applications. | |
</p> | |
<h2 id="demo" class="section-title">Example</h2> | |
<p> | |
We evaluated the effectiveness of the Attention Tracker by visualizing the distribution of attention aggregation | |
for key heads across different data types (normal data vs. attack data) in the Open-Prompt-Injection dataset. | |
Additionally, we calculated the focus score for these data samples. A higher focus score indicates a lower | |
likelihood of prompt injection attacks. The tested model is Qwen-2 1.8b. | |
</p> | |
<div class="group-title green">Normal Data</div> | |
<div class="image-gallery-container normal-gallery"> | |
<span class="arrow left-arrow" onclick="navigateImages('normal', -1)"><</span> | |
<div class="image-gallery"> | |
<!-- <img id="normalImage1" src="./demo_results/normal_1.png" alt="Normal Image 1"> --> | |
<img id="normalImage2" src="./demo_results/normal_2.png" alt="Normal Image 2"> | |
<img id="normalImage3" src="./demo_results/normal_3.png" alt="Normal Image 3"> | |
<img id="normalImage4" src="./demo_results/normal_4.png" alt="Normal Image 4"> | |
<img id="normalImage5" src="./demo_results/normal_5.png" alt="Normal Image 5"> | |
</div> | |
<span class="arrow right-arrow" onclick="navigateImages('normal', 1)">></span> | |
</div> | |
<div class="group-title red">Attack Data</div> | |
<div class="image-gallery-container attack-gallery"> | |
<span class="arrow left-arrow" onclick="navigateImages('attack', -1)"><</span> | |
<div class="image-gallery"> | |
<!-- <img id="attackImage1" src="./demo_results/attack_1.png" alt="Attack Image 1" class="active"> --> | |
<img id="attackImage2" src="./demo_results/attack_2.png" alt="Attack Image 2"> | |
<img id="attackImage3" src="./demo_results/attack_3.png" alt="Attack Image 3"> | |
<img id="attackImage4" src="./demo_results/attack_4.png" alt="Attack Image 4"> | |
<img id="attackImage5" src="./demo_results/attack_5.png" alt="Attack Image 5"> | |
</div> | |
<span class="arrow right-arrow" onclick="navigateImages('attack', 1)">></span> | |
</div> | |
<!-- <h2 id="inquiries" class="section-title"> Inquiries on Attention Tracker</h2> | |
<p class="section-title"> Please contact <a href="Mailto:khhung906@gmail.com">Kuo-Han Hung</a> | |
and <a href="Mailto:pin-yu.chen@ibm.com">Pin-Yu Chen</a> | |
</p> --> | |
<h2 id="citations" class="section-title">Citations</h2> | |
<p>If you find Attention Tracker helpful and useful for your research, please cite our main paper as follows:</p> | |
<div class="language-plaintext highlighter-rouge"> | |
<div class="highlight"> | |
<pre class="highlight"> | |
<code>@misc{hung2024attentiontrackerdetectingprompt, | |
title={Attention Tracker: Detecting Prompt Injection Attacks in LLMs}, | |
author={Kuo-Han Hung and Ching-Yun Ko and Ambrish Rawat and I-Hsin Chung and Winston H. Hsu and Pin-Yu Chen}, | |
year={2024}, | |
eprint={2411.00348}, | |
archivePrefix={arXiv}, | |
primaryClass={cs.CR}, | |
url={https://arxiv.org/abs/2411.00348}, | |
}</code></pre> | |
</div> | |
</div> | |
<footer class="site-footer"> | |
<span class="site-footer-owner">This website is maintained by <a href="https://khhung906.github.io/">Kuo-Han | |
Hung</a></a>.</span> | |
</footer> | |
</main> | |
</body> | |
</html> |