File size: 7,127 Bytes
dc68796
afc7627
dc68796
 
afc7627
dc68796
 
afc7627
dc68796
 
 
b375146
dc68796
 
 
 
 
 
 
 
 
694cb16
82428f3
 
dc68796
82428f3
 
 
 
2751a3d
82428f3
 
 
 
 
eacf5c6
dc68796
 
82428f3
 
 
 
41c2afe
82428f3
694cb16
 
 
2751a3d
82428f3
41c2afe
82428f3
41c2afe
af2b329
9e041f4
dc68796
82428f3
dc68796
 
82428f3
d258ead
82428f3
e484946
 
82428f3
dc68796
8fb2f84
 
 
 
 
 
 
 
 
 
af2b329
 
9efd8ca
dc68796
82428f3
dc68796
8fb2f84
 
 
 
 
 
 
 
 
 
af2b329
 
8fb2f84
82428f3
8fb2f84
 
 
 
 
 
 
 
 
 
af2b329
 
9efd8ca
dc68796
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
$(document).ready(function(){
    // Intro figure switcher: clicking a not-yet-selected <span> inside
    // #jailbreak-intro marks it as selected and swaps the intro image.
    $('#jailbreak-intro').on('click', 'span', function (event) {
        event.preventDefault();
        var $clicked = $(this);
        if ($clicked.hasClass('selected')) {
            return; // this entry is already the active one
        }
        $('#jailbreak-intro-label > span').removeClass('selected');
        $clicked.addClass('selected');
        // The span's text is used verbatim as the image file-name suffix.
        var imageKey = $clicked.text();
        var introImage = document.getElementById("jailbreak-intro-img");
        introImage.src = "images/metrics/intro-" + imageKey + ".png";
    });
    // Formula tabs: clicking an unselected link in #refusal-loss-formula-list
    // hides every .formula element and shows the one addressed by the link's href.
    $('#refusal-loss-formula-list').on('click', 'a', function (event) {
        event.preventDefault();
        var $link = $(this);
        if ($link.hasClass('selected')) {
            return; // this formula is already the active one
        }
        $('.formula').hide(200);
        $('.formula-list > a').removeClass('selected');
        $link.addClass('selected');
        // The href value doubles as the selector of the element to reveal.
        var panelSelector = $link.attr('href');
        $(panelSelector).show(200);
    });

    // Shared state for the interactive demo:
    //   diagram           - the <img id="reliability-diagram"> whose src is swapped
    //   jailbreak_results - parsed results table; stays undefined until the
    //                       request below completes successfully
    var diagram = document.getElementById("reliability-diagram");
    var jailbreak_results;
    $.getJSON("demo_results/jailbreak_defense_results.jsonl", function (data) {
        jailbreak_results = data;
    }).fail(function (jqXHR, textStatus, errorThrown) {
        // Surface load/parse failures instead of silently leaving the table undefined.
        console.error(
            "Failed to load demo_results/jailbreak_defense_results.jsonl:",
            textStatus,
            errorThrown
        );
    });

    // Switching the model only needs the figure (and metric read-outs) refreshed.
    $('input[type=radio][name=models]').change(function () {
        diagram.src = updateFigurePath();
    });

    // Switching the defense also decides which threshold slider is usable:
    // 'ppl' enables only the ppl slider, 'gradient_cuff' enables only the
    // gradient-norm slider, and any other defense disables both.
    $('input[type=radio][name=defense]').change(function () {
        $("#ppl-slider").slider(this.value === 'ppl' ? 'enable' : 'disable');
        $("#gradient-norm-slider").slider(this.value === 'gradient_cuff' ? 'enable' : 'disable');
        diagram.src = updateFigurePath();
    });
    // Gradient-norm threshold slider: range 0-800 in steps of 50, starting at 100.
    // #gradient-norm-threshold displays the slider's value as text.
    var bin_handle = $("#gradient-norm-threshold");
    var gradientNormSliderOptions = {
        min: 0,
        max: 800,
        step: 50,
        value: 100,
        // Show the initial value as soon as the widget exists.
        create: function () {
            var initialValue = $(this).slider("value");
            bin_handle.text(initialValue);
        },
        // While sliding, show ui.value and refresh the figure for that threshold.
        slide: function (event, ui) {
            bin_handle.text(ui.value);
            diagram.src = updateFigurePath(ui.value);
        },
    };
    $("#gradient-norm-slider").slider(gradientNormSliderOptions);
    // Perplexity threshold slider: range 0-200 in steps of 20, starting at 100,
    // created disabled. #ppl-threshold displays the slider's value as text.
    var ts_handle = $("#ppl-threshold");
    var pplSliderOptions = {
        min: 0,
        max: 200,
        step: 20,
        value: 100,
        disabled: true,
        // Show the initial value as soon as the widget exists.
        create: function () {
            var initialValue = $(this).slider("value");
            ts_handle.text(initialValue);
        },
        // While sliding, show ui.value and refresh the figure. The gradient-norm
        // slider's current value is passed explicitly so that ui.value can be
        // supplied as the second (ppl) argument.
        slide: function (event, ui) {
            ts_handle.text(ui.value);
            var gradientNormValue = $('#gradient-norm-slider').slider("option", "value");
            diagram.src = updateFigurePath(gradientNormValue, ui.value);
        },
    };
    $("#ppl-slider").slider(pplSliderOptions);

    /**
     * Refresh the two metric read-outs (#asr-value and #brr-value) for the
     * currently checked `models` / `defense` radios and return the path of the
     * matching figure image.
     *
     * Metrics are read from the closure variable `jailbreak_results`, indexed as
     * results[defense][model][attack] and, for the "ppl" and "gradient_cuff"
     * defenses, additionally by the threshold rendered as a string.
     *
     *   #asr-value = (tpr of adv_harmful_behavior + autodan + pair + tap + base64
     *                 + mean tpr of the four lrl_* entries) / 6
     *   #brr-value = fpr of the "pair" entry
     *
     * The figure path depends only on the selected options, so it is returned
     * even when the metrics cannot be looked up (results not loaded yet, or the
     * defense/model/threshold is absent from the table). In that case both
     * read-outs show "N/A" instead of the function throwing.
     *
     * @param {number} [gradient_norm_threshold] Threshold used when the defense is
     *     "gradient_cuff"; defaults to the current value of #gradient-norm-slider.
     * @param {number} [ppl_threshold] Threshold used when the defense is "ppl";
     *     defaults to the current value of #ppl-slider.
     * @returns {string} Relative path of the figure for the current selection.
     */
    function updateFigurePath(
        gradient_norm_threshold = $('#gradient-norm-slider').slider("option", "value"),
        ppl_threshold = $('#ppl-slider').slider("option", "value")
    ) {
        var PRIMARY_ATTACKS = ["adv_harmful_behavior", "autodan", "pair", "tap", "base64"];
        var LRL_ATTACKS = ["lrl_de", "lrl_fr", "lrl_zh-CN", "lrl_sv"];

        var cur_model = $("input[type='radio'][name='models']:checked").val();
        var cur_defense = $("input[type='radio'][name='defense']:checked").val();

        // Only the two threshold-based defenses add an extra level of keys.
        var threshold_key = null;
        if (cur_defense === "ppl") {
            threshold_key = String(ppl_threshold);
        } else if (cur_defense === "gradient_cuff") {
            threshold_key = String(gradient_norm_threshold);
        }

        var defense_results = jailbreak_results ? jailbreak_results[cur_defense] : undefined;
        var model_results = defense_results ? defense_results[cur_model] : undefined;

        // Look up one metric for one attack; NaN when any level is missing.
        function readMetric(attack, field) {
            var entry = model_results ? model_results[attack] : undefined;
            if (entry && threshold_key !== null) {
                entry = entry[threshold_key];
            }
            return entry ? parseFloat(entry[field]) : NaN;
        }

        // Sum of tpr over a list of attacks (NaN propagates if any is missing).
        function sumTpr(attacks) {
            return attacks.reduce(function (total, attack) {
                return total + readMetric(attack, "tpr");
            }, 0);
        }

        // Three decimals for real numbers, "N/A" when the metric is unavailable.
        function formatMetric(value) {
            return Number.isFinite(value) ? value.toFixed(3) : "N/A";
        }

        // The lrl_* entries are averaged and count as a single sixth attack.
        var asr = (sumTpr(PRIMARY_ATTACKS) + sumTpr(LRL_ATTACKS) / LRL_ATTACKS.length)
            / (PRIMARY_ATTACKS.length + 1);
        var brr = readMetric("pair", "fpr");

        document.getElementById("asr-value").innerText = formatMetric(asr);
        document.getElementById("brr-value").innerText = formatMetric(brr);

        var threshold_suffix = threshold_key === null ? "" : "_threshold_" + threshold_key;
        return "demo_results/" + cur_defense + "_" + cur_model + threshold_suffix + ".png";
    }
});