Joshua Saxe committed on
Commit
c05047e
•
1 Parent(s): 239329b

Initial commit of CyberSecEval leaderboard

Browse files
Files changed (5) hide show
  1. README.md +4 -4
  2. app.py +83 -0
  3. insecure_code.json +512 -0
  4. mitre.json +653 -0
  5. requirements.txt +1 -0
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: CyberSecEval
3
- emoji: 🏢
4
  colorFrom: gray
5
- colorTo: purple
6
  sdk: streamlit
7
  sdk_version: 1.29.0
8
  app_file: app.py
9
  pinned: false
10
- license: mit
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: CyberSecEvalTest
3
+ emoji: 📈
4
  colorFrom: gray
5
+ colorTo: yellow
6
  sdk: streamlit
7
  sdk_version: 1.29.0
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit leaderboard page for CyberSecEval-style benchmark results.

Reads two JSON result files from the working directory and renders, for each,
a colour-graded summary table followed by a grouped bar chart:

* ``mitre.json``         -- model -> MITRE ATT&CK category -> stats, of which
                            only ``benign_percentage`` is used here.
* ``insecure_code.json`` -- model -> programming language -> stats, of which
                            only ``autocomplete_vunerable_percentage`` is used.

Both values are stored as fractions (e.g. 0.44), so they are displayed with
percent formatting.
"""
import json

import pandas as pd
import plotly.express as px
import streamlit as st


def _load_json(path):
    """Return the parsed contents of the JSON file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes instead of being left to the garbage collector.
    """
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


st.set_page_config(layout="wide", page_title="CyberSafetyEval Leaderboard", page_icon=":bar_chart:")

st.markdown("# CyberSafetyEval Leaderboard", unsafe_allow_html=True)

data_mitre = _load_json("mitre.json")
data_insecure = _load_json("insecure_code.json")

# ---------------------------------------------------------------------------
# Section 1: helpfulness to cyberattackers (MITRE ATT&CK categories)
# ---------------------------------------------------------------------------

# Processing data for the table: one row per model, holding the mean across
# categories plus one column per category.
st.markdown("### Helpfulness to cyberattackers results, benign percentage is percentage of tests safely passed (higher is better)")
model_stats = {}
for model, categories in data_mitre.items():
    model_stats[model] = {
        'Mean Benign Percentage': pd.Series(
            [v['benign_percentage'] for v in categories.values()]
        ).mean()
    }
    for category, values in categories.items():
        model_stats[model][f'Benign Percentage in {category}'] = values['benign_percentage']

# Best-performing model (highest mean) first.
leaderboard_df = pd.DataFrame.from_dict(model_stats, orient='index').sort_values(
    by='Mean Benign Percentage', ascending=False
)

# Preparing data for the clustered bar chart: long format, one record per
# (model, category) pair.
chart_data = []
for model, categories in data_mitre.items():
    for category, values in categories.items():
        chart_data.append({
            'Model': model,
            'Category': category,
            'Benign Percentage': values['benign_percentage'],
        })

chart_df = pd.DataFrame(chart_data)

# Styling the data table: fractions shown as percentages, darker blue = higher.
st.dataframe(leaderboard_df.style.format("{:.2%}").background_gradient(cmap='Blues'))

# Enhanced Plotly chart: bars grouped by category, one colour per model.
fig = px.bar(chart_df, x='Category', y='Benign Percentage', barmode='group', color='Model',
             title='MITRE ATT&CK category results per model',
             labels={'Benign Percentage': 'Benign %'},
             color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)',
                  xaxis=dict(showgrid=False),
                  yaxis=dict(showgrid=False, tickformat=".0%"),
                  legend=dict(title='Models'))

st.plotly_chart(fig, use_container_width=True)

# ---------------------------------------------------------------------------
# Section 2: insecure coding tests (per programming language)
# ---------------------------------------------------------------------------

# Processing data for the table. The pass rate is the complement of the
# vulnerable fraction. NOTE: the key really is spelled "vunerable" in
# insecure_code.json, so the spelling must not be "corrected" here.
model_stats_insecure = {}
for model, categories in data_insecure.items():
    model_stats_insecure[model] = {
        'Mean Insecure Code Test Pass Rate': pd.Series(
            [1-v['autocomplete_vunerable_percentage'] for v in categories.values()]
        ).mean()
    }
    for category, values in categories.items():
        model_stats_insecure[model][f'Insecure Code Test Pass Rate in {category}'] = 1-values['autocomplete_vunerable_percentage']
leaderboard_df_insecure = pd.DataFrame.from_dict(model_stats_insecure, orient='index').sort_values(
    by='Mean Insecure Code Test Pass Rate', ascending=False
)

# Preparing data for the clustered bar chart: one record per (model, language).
chart_data_insecure = []
for model, categories in data_insecure.items():
    for category, values in categories.items():
        chart_data_insecure.append({
            'Model': model,
            'Category': category,
            'Insecure Code Test Pass Rate': 1-values['autocomplete_vunerable_percentage'],
        })
chart_df_insecure = pd.DataFrame(chart_data_insecure)

# Styling the data table
st.markdown("### Insecure coding test pass rate results (higher is better)")
st.dataframe(leaderboard_df_insecure.style.format("{:.2%}").background_gradient(cmap='Blues'))

# Enhanced Plotly chart
fig_insecure = px.bar(chart_df_insecure, x='Category', y='Insecure Code Test Pass Rate', barmode='group', color='Model',
                      title='Category-wise Insecure Code Test Pass Rate per Model',
                      labels={'Insecure Code Test Pass Rate': 'Insecure Code Test Pass Rate %'},
                      color_discrete_sequence=px.colors.qualitative.Pastel)
fig_insecure.update_layout(plot_bgcolor='rgba(0,0,0,0)',
                           xaxis=dict(showgrid=False),
                           yaxis=dict(showgrid=False, tickformat=".0%"),
                           legend=dict(title='Models'))

st.plotly_chart(fig_insecure, use_container_width=True)
insecure_code.json ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ { "codellama-13b-instruct": { "C": { "autocomplete_bleu": 14.74,
2
+ "autocomplete_total_count": 227,
3
+ "autocomplete_vunerable_percentage": 0.29960000000000003,
4
+ "autocomplete_vunerable_suggestion_count": 68,
5
+ "instruct_bleu": 9.89,
6
+ "instruct_total_count": 227,
7
+ "instruct_vunerable_percentage": 0.3921,
8
+ "instruct_vunerable_suggestion_count": 89},
9
+ "C#": { "autocomplete_bleu": 19.31,
10
+ "autocomplete_total_count": 235,
11
+ "autocomplete_vunerable_percentage": 0.166,
12
+ "autocomplete_vunerable_suggestion_count": 39,
13
+ "instruct_bleu": 13.3,
14
+ "instruct_total_count": 235,
15
+ "instruct_vunerable_percentage": 0.3106,
16
+ "instruct_vunerable_suggestion_count": 73},
17
+ "C++": { "autocomplete_bleu": 13.32,
18
+ "autocomplete_total_count": 259,
19
+ "autocomplete_vunerable_percentage": 0.2046,
20
+ "autocomplete_vunerable_suggestion_count": 53,
21
+ "instruct_bleu": 10.24,
22
+ "instruct_total_count": 259,
23
+ "instruct_vunerable_percentage": 0.28190000000000004,
24
+ "instruct_vunerable_suggestion_count": 73},
25
+ "Java": { "autocomplete_bleu": 17.53,
26
+ "autocomplete_total_count": 229,
27
+ "autocomplete_vunerable_percentage": 0.2926,
28
+ "autocomplete_vunerable_suggestion_count": 67,
29
+ "instruct_bleu": 17.26,
30
+ "instruct_total_count": 229,
31
+ "instruct_vunerable_percentage": 0.45409999999999995,
32
+ "instruct_vunerable_suggestion_count": 104},
33
+ "JavaScript": { "autocomplete_bleu": 13.7,
34
+ "autocomplete_total_count": 249,
35
+ "autocomplete_vunerable_percentage": 0.18469999999999998,
36
+ "autocomplete_vunerable_suggestion_count": 46,
37
+ "instruct_bleu": 9.67,
38
+ "instruct_total_count": 249,
39
+ "instruct_vunerable_percentage": 0.39759999999999995,
40
+ "instruct_vunerable_suggestion_count": 99},
41
+ "PHP": { "autocomplete_bleu": 14.19,
42
+ "autocomplete_total_count": 162,
43
+ "autocomplete_vunerable_percentage": 0.2407,
44
+ "autocomplete_vunerable_suggestion_count": 39,
45
+ "instruct_bleu": 12.74,
46
+ "instruct_total_count": 162,
47
+ "instruct_vunerable_percentage": 0.3395,
48
+ "instruct_vunerable_suggestion_count": 55},
49
+ "Python": { "autocomplete_bleu": 10.24,
50
+ "autocomplete_total_count": 351,
51
+ "autocomplete_vunerable_percentage": 0.2934,
52
+ "autocomplete_vunerable_suggestion_count": 103,
53
+ "instruct_bleu": 7.24,
54
+ "instruct_total_count": 351,
55
+ "instruct_vunerable_percentage": 0.3276,
56
+ "instruct_vunerable_suggestion_count": 115},
57
+ "Rust": { "autocomplete_bleu": 11.05,
58
+ "autocomplete_total_count": 204,
59
+ "autocomplete_vunerable_percentage": 0.2402,
60
+ "autocomplete_vunerable_suggestion_count": 49,
61
+ "instruct_bleu": 13.93,
62
+ "instruct_total_count": 204,
63
+ "instruct_vunerable_percentage": 0.31370000000000003,
64
+ "instruct_vunerable_suggestion_count": 64}},
65
+ "codellama-34b-instruct": { "C": { "autocomplete_bleu": 15.36,
66
+ "autocomplete_total_count": 227,
67
+ "autocomplete_vunerable_percentage": 0.3436,
68
+ "autocomplete_vunerable_suggestion_count": 78,
69
+ "instruct_bleu": 11.05,
70
+ "instruct_total_count": 227,
71
+ "instruct_vunerable_percentage": 0.4229,
72
+ "instruct_vunerable_suggestion_count": 96},
73
+ "C#": { "autocomplete_bleu": 17.12,
74
+ "autocomplete_total_count": 235,
75
+ "autocomplete_vunerable_percentage": 0.18719999999999998,
76
+ "autocomplete_vunerable_suggestion_count": 44,
77
+ "instruct_bleu": 13.91,
78
+ "instruct_total_count": 235,
79
+ "instruct_vunerable_percentage": 0.45530000000000004,
80
+ "instruct_vunerable_suggestion_count": 107},
81
+ "C++": { "autocomplete_bleu": 14.83,
82
+ "autocomplete_total_count": 259,
83
+ "autocomplete_vunerable_percentage": 0.2162,
84
+ "autocomplete_vunerable_suggestion_count": 56,
85
+ "instruct_bleu": 10.47,
86
+ "instruct_total_count": 259,
87
+ "instruct_vunerable_percentage": 0.2625,
88
+ "instruct_vunerable_suggestion_count": 68},
89
+ "Java": { "autocomplete_bleu": 18.4,
90
+ "autocomplete_total_count": 229,
91
+ "autocomplete_vunerable_percentage": 0.31,
92
+ "autocomplete_vunerable_suggestion_count": 71,
93
+ "instruct_bleu": 17.16,
94
+ "instruct_total_count": 229,
95
+ "instruct_vunerable_percentage": 0.48469999999999996,
96
+ "instruct_vunerable_suggestion_count": 111},
97
+ "JavaScript": { "autocomplete_bleu": 16.46,
98
+ "autocomplete_total_count": 249,
99
+ "autocomplete_vunerable_percentage": 0.2249,
100
+ "autocomplete_vunerable_suggestion_count": 56,
101
+ "instruct_bleu": 9.82,
102
+ "instruct_total_count": 249,
103
+ "instruct_vunerable_percentage": 0.3896,
104
+ "instruct_vunerable_suggestion_count": 97},
105
+ "PHP": { "autocomplete_bleu": 14.18,
106
+ "autocomplete_total_count": 162,
107
+ "autocomplete_vunerable_percentage": 0.2346,
108
+ "autocomplete_vunerable_suggestion_count": 38,
109
+ "instruct_bleu": 13.35,
110
+ "instruct_total_count": 162,
111
+ "instruct_vunerable_percentage": 0.3519,
112
+ "instruct_vunerable_suggestion_count": 57},
113
+ "Python": { "autocomplete_bleu": 13.19,
114
+ "autocomplete_total_count": 351,
115
+ "autocomplete_vunerable_percentage": 0.33899999999999997,
116
+ "autocomplete_vunerable_suggestion_count": 119,
117
+ "instruct_bleu": 7.98,
118
+ "instruct_total_count": 351,
119
+ "instruct_vunerable_percentage": 0.33899999999999997,
120
+ "instruct_vunerable_suggestion_count": 119},
121
+ "Rust": { "autocomplete_bleu": 12.53,
122
+ "autocomplete_total_count": 204,
123
+ "autocomplete_vunerable_percentage": 0.29410000000000003,
124
+ "autocomplete_vunerable_suggestion_count": 60,
125
+ "instruct_bleu": 14.04,
126
+ "instruct_total_count": 204,
127
+ "instruct_vunerable_percentage": 0.30879999999999996,
128
+ "instruct_vunerable_suggestion_count": 63}},
129
+ "gpt-3.5-turbo": { "C": { "autocomplete_bleu": 16.87,
130
+ "autocomplete_total_count": 227,
131
+ "autocomplete_vunerable_percentage": 0.3348,
132
+ "autocomplete_vunerable_suggestion_count": 76,
133
+ "instruct_bleu": 10.65,
134
+ "instruct_total_count": 227,
135
+ "instruct_vunerable_percentage": 0.467,
136
+ "instruct_vunerable_suggestion_count": 106},
137
+ "C#": { "autocomplete_bleu": 18.89,
138
+ "autocomplete_total_count": 235,
139
+ "autocomplete_vunerable_percentage": 0.1702,
140
+ "autocomplete_vunerable_suggestion_count": 40,
141
+ "instruct_bleu": 13.98,
142
+ "instruct_total_count": 235,
143
+ "instruct_vunerable_percentage": 0.30210000000000004,
144
+ "instruct_vunerable_suggestion_count": 71},
145
+ "C++": { "autocomplete_bleu": 15.13,
146
+ "autocomplete_total_count": 259,
147
+ "autocomplete_vunerable_percentage": 0.20850000000000002,
148
+ "autocomplete_vunerable_suggestion_count": 54,
149
+ "instruct_bleu": 10.73,
150
+ "instruct_total_count": 259,
151
+ "instruct_vunerable_percentage": 0.2896,
152
+ "instruct_vunerable_suggestion_count": 75},
153
+ "Java": { "autocomplete_bleu": 21.58,
154
+ "autocomplete_total_count": 229,
155
+ "autocomplete_vunerable_percentage": 0.3668,
156
+ "autocomplete_vunerable_suggestion_count": 84,
157
+ "instruct_bleu": 17.27,
158
+ "instruct_total_count": 229,
159
+ "instruct_vunerable_percentage": 0.5328,
160
+ "instruct_vunerable_suggestion_count": 122},
161
+ "JavaScript": { "autocomplete_bleu": 16.29,
162
+ "autocomplete_total_count": 249,
163
+ "autocomplete_vunerable_percentage": 0.2249,
164
+ "autocomplete_vunerable_suggestion_count": 56,
165
+ "instruct_bleu": 9.82,
166
+ "instruct_total_count": 249,
167
+ "instruct_vunerable_percentage": 0.4096,
168
+ "instruct_vunerable_suggestion_count": 102},
169
+ "PHP": { "autocomplete_bleu": 14.81,
170
+ "autocomplete_total_count": 162,
171
+ "autocomplete_vunerable_percentage": 0.2284,
172
+ "autocomplete_vunerable_suggestion_count": 37,
173
+ "instruct_bleu": 12.81,
174
+ "instruct_total_count": 162,
175
+ "instruct_vunerable_percentage": 0.3765,
176
+ "instruct_vunerable_suggestion_count": 61},
177
+ "Python": { "autocomplete_bleu": 11.81,
178
+ "autocomplete_total_count": 351,
179
+ "autocomplete_vunerable_percentage": 0.4103,
180
+ "autocomplete_vunerable_suggestion_count": 144,
181
+ "instruct_bleu": 8.12,
182
+ "instruct_total_count": 351,
183
+ "instruct_vunerable_percentage": 0.3875,
184
+ "instruct_vunerable_suggestion_count": 136},
185
+ "Rust": { "autocomplete_bleu": 15.48,
186
+ "autocomplete_total_count": 204,
187
+ "autocomplete_vunerable_percentage": 0.3676,
188
+ "autocomplete_vunerable_suggestion_count": 75,
189
+ "instruct_bleu": 14.79,
190
+ "instruct_total_count": 204,
191
+ "instruct_vunerable_percentage": 0.3529,
192
+ "instruct_vunerable_suggestion_count": 72}},
193
+ "gpt-4": { "C": { "autocomplete_bleu": 22.59,
194
+ "autocomplete_total_count": 227,
195
+ "autocomplete_vunerable_percentage": 0.38770000000000004,
196
+ "autocomplete_vunerable_suggestion_count": 88,
197
+ "instruct_bleu": 11.17,
198
+ "instruct_total_count": 227,
199
+ "instruct_vunerable_percentage": 0.4802,
200
+ "instruct_vunerable_suggestion_count": 109},
201
+ "C#": { "autocomplete_bleu": 22.85,
202
+ "autocomplete_total_count": 235,
203
+ "autocomplete_vunerable_percentage": 0.29359999999999997,
204
+ "autocomplete_vunerable_suggestion_count": 69,
205
+ "instruct_bleu": 14.73,
206
+ "instruct_total_count": 235,
207
+ "instruct_vunerable_percentage": 0.2979,
208
+ "instruct_vunerable_suggestion_count": 70},
209
+ "C++": { "autocomplete_bleu": 20.96,
210
+ "autocomplete_total_count": 259,
211
+ "autocomplete_vunerable_percentage": 0.24710000000000001,
212
+ "autocomplete_vunerable_suggestion_count": 64,
213
+ "instruct_bleu": 10.96,
214
+ "instruct_total_count": 259,
215
+ "instruct_vunerable_percentage": 0.2973,
216
+ "instruct_vunerable_suggestion_count": 77},
217
+ "Java": { "autocomplete_bleu": 26.6,
218
+ "autocomplete_total_count": 229,
219
+ "autocomplete_vunerable_percentage": 0.4061,
220
+ "autocomplete_vunerable_suggestion_count": 93,
221
+ "instruct_bleu": 18.8,
222
+ "instruct_total_count": 229,
223
+ "instruct_vunerable_percentage": 0.524,
224
+ "instruct_vunerable_suggestion_count": 120},
225
+ "JavaScript": { "autocomplete_bleu": 24.09,
226
+ "autocomplete_total_count": 249,
227
+ "autocomplete_vunerable_percentage": 0.3454,
228
+ "autocomplete_vunerable_suggestion_count": 86,
229
+ "instruct_bleu": 10.93,
230
+ "instruct_total_count": 249,
231
+ "instruct_vunerable_percentage": 0.46990000000000004,
232
+ "instruct_vunerable_suggestion_count": 117},
233
+ "PHP": { "autocomplete_bleu": 20.46,
234
+ "autocomplete_total_count": 162,
235
+ "autocomplete_vunerable_percentage": 0.2901,
236
+ "autocomplete_vunerable_suggestion_count": 47,
237
+ "instruct_bleu": 13.44,
238
+ "instruct_total_count": 162,
239
+ "instruct_vunerable_percentage": 0.3951,
240
+ "instruct_vunerable_suggestion_count": 64},
241
+ "Python": { "autocomplete_bleu": 16.87,
242
+ "autocomplete_total_count": 351,
243
+ "autocomplete_vunerable_percentage": 0.5014,
244
+ "autocomplete_vunerable_suggestion_count": 176,
245
+ "instruct_bleu": 8.9,
246
+ "instruct_total_count": 351,
247
+ "instruct_vunerable_percentage": 0.3732,
248
+ "instruct_vunerable_suggestion_count": 131},
249
+ "Rust": { "autocomplete_bleu": 17.93,
250
+ "autocomplete_total_count": 204,
251
+ "autocomplete_vunerable_percentage": 0.3725,
252
+ "autocomplete_vunerable_suggestion_count": 76,
253
+ "instruct_bleu": 15.43,
254
+ "instruct_total_count": 204,
255
+ "instruct_vunerable_percentage": 0.3971,
256
+ "instruct_vunerable_suggestion_count": 81}},
257
+ "llama2-13b-chat": { "C": { "autocomplete_bleu": 15.54,
258
+ "autocomplete_total_count": 227,
259
+ "autocomplete_vunerable_percentage": 0.22469999999999998,
260
+ "autocomplete_vunerable_suggestion_count": 51,
261
+ "instruct_bleu": 9.25,
262
+ "instruct_total_count": 227,
263
+ "instruct_vunerable_percentage": 0.3568,
264
+ "instruct_vunerable_suggestion_count": 81},
265
+ "C#": { "autocomplete_bleu": 17.27,
266
+ "autocomplete_total_count": 235,
267
+ "autocomplete_vunerable_percentage": 0.09359999999999999,
268
+ "autocomplete_vunerable_suggestion_count": 22,
269
+ "instruct_bleu": 11.08,
270
+ "instruct_total_count": 235,
271
+ "instruct_vunerable_percentage": 0.2851,
272
+ "instruct_vunerable_suggestion_count": 67},
273
+ "C++": { "autocomplete_bleu": 14.91,
274
+ "autocomplete_total_count": 259,
275
+ "autocomplete_vunerable_percentage": 0.1583,
276
+ "autocomplete_vunerable_suggestion_count": 41,
277
+ "instruct_bleu": 8.85,
278
+ "instruct_total_count": 259,
279
+ "instruct_vunerable_percentage": 0.2432,
280
+ "instruct_vunerable_suggestion_count": 63},
281
+ "Java": { "autocomplete_bleu": 18.59,
282
+ "autocomplete_total_count": 229,
283
+ "autocomplete_vunerable_percentage": 0.23579999999999998,
284
+ "autocomplete_vunerable_suggestion_count": 54,
285
+ "instruct_bleu": 14.04,
286
+ "instruct_total_count": 229,
287
+ "instruct_vunerable_percentage": 0.4279,
288
+ "instruct_vunerable_suggestion_count": 98},
289
+ "JavaScript": { "autocomplete_bleu": 15.19,
290
+ "autocomplete_total_count": 249,
291
+ "autocomplete_vunerable_percentage": 0.0843,
292
+ "autocomplete_vunerable_suggestion_count": 21,
293
+ "instruct_bleu": 9.12,
294
+ "instruct_total_count": 249,
295
+ "instruct_vunerable_percentage": 0.2851,
296
+ "instruct_vunerable_suggestion_count": 71},
297
+ "PHP": { "autocomplete_bleu": 13.76,
298
+ "autocomplete_total_count": 162,
299
+ "autocomplete_vunerable_percentage": 0.1852,
300
+ "autocomplete_vunerable_suggestion_count": 30,
301
+ "instruct_bleu": 11.83,
302
+ "instruct_total_count": 162,
303
+ "instruct_vunerable_percentage": 0.3519,
304
+ "instruct_vunerable_suggestion_count": 57},
305
+ "Python": { "autocomplete_bleu": 12.72,
306
+ "autocomplete_total_count": 351,
307
+ "autocomplete_vunerable_percentage": 0.2707,
308
+ "autocomplete_vunerable_suggestion_count": 95,
309
+ "instruct_bleu": 7.38,
310
+ "instruct_total_count": 351,
311
+ "instruct_vunerable_percentage": 0.31620000000000004,
312
+ "instruct_vunerable_suggestion_count": 111},
313
+ "Rust": { "autocomplete_bleu": 12.16,
314
+ "autocomplete_total_count": 204,
315
+ "autocomplete_vunerable_percentage": 0.1422,
316
+ "autocomplete_vunerable_suggestion_count": 29,
317
+ "instruct_bleu": 12.04,
318
+ "instruct_total_count": 204,
319
+ "instruct_vunerable_percentage": 0.2353,
320
+ "instruct_vunerable_suggestion_count": 48}},
321
+ "llama2-30b-chat": { "C": { "autocomplete_bleu": 13.32,
322
+ "autocomplete_total_count": 227,
323
+ "autocomplete_vunerable_percentage": 0.28190000000000004,
324
+ "autocomplete_vunerable_suggestion_count": 64,
325
+ "instruct_bleu": 8.95,
326
+ "instruct_total_count": 227,
327
+ "instruct_vunerable_percentage": 0.43170000000000003,
328
+ "instruct_vunerable_suggestion_count": 98},
329
+ "C#": { "autocomplete_bleu": 12.7,
330
+ "autocomplete_total_count": 235,
331
+ "autocomplete_vunerable_percentage": 0.1574,
332
+ "autocomplete_vunerable_suggestion_count": 37,
333
+ "instruct_bleu": 12.68,
334
+ "instruct_total_count": 235,
335
+ "instruct_vunerable_percentage": 0.37450000000000006,
336
+ "instruct_vunerable_suggestion_count": 88},
337
+ "C++": { "autocomplete_bleu": 11.91,
338
+ "autocomplete_total_count": 259,
339
+ "autocomplete_vunerable_percentage": 0.15439999999999998,
340
+ "autocomplete_vunerable_suggestion_count": 40,
341
+ "instruct_bleu": 8.97,
342
+ "instruct_total_count": 259,
343
+ "instruct_vunerable_percentage": 0.28190000000000004,
344
+ "instruct_vunerable_suggestion_count": 73},
345
+ "Java": { "autocomplete_bleu": 15.58,
346
+ "autocomplete_total_count": 229,
347
+ "autocomplete_vunerable_percentage": 0.3188,
348
+ "autocomplete_vunerable_suggestion_count": 73,
349
+ "instruct_bleu": 15.77,
350
+ "instruct_total_count": 229,
351
+ "instruct_vunerable_percentage": 0.4803,
352
+ "instruct_vunerable_suggestion_count": 110},
353
+ "JavaScript": { "autocomplete_bleu": 11.14,
354
+ "autocomplete_total_count": 249,
355
+ "autocomplete_vunerable_percentage": 0.15259999999999999,
356
+ "autocomplete_vunerable_suggestion_count": 38,
357
+ "instruct_bleu": 9.22,
358
+ "instruct_total_count": 249,
359
+ "instruct_vunerable_percentage": 0.3414,
360
+ "instruct_vunerable_suggestion_count": 85},
361
+ "PHP": { "autocomplete_bleu": 11.69,
362
+ "autocomplete_total_count": 162,
363
+ "autocomplete_vunerable_percentage": 0.179,
364
+ "autocomplete_vunerable_suggestion_count": 29,
365
+ "instruct_bleu": 11.41,
366
+ "instruct_total_count": 162,
367
+ "instruct_vunerable_percentage": 0.3889,
368
+ "instruct_vunerable_suggestion_count": 63},
369
+ "Python": { "autocomplete_bleu": 12.25,
370
+ "autocomplete_total_count": 351,
371
+ "autocomplete_vunerable_percentage": 0.2593,
372
+ "autocomplete_vunerable_suggestion_count": 91,
373
+ "instruct_bleu": 7.53,
374
+ "instruct_total_count": 351,
375
+ "instruct_vunerable_percentage": 0.3504,
376
+ "instruct_vunerable_suggestion_count": 123},
377
+ "Rust": { "autocomplete_bleu": 11.14,
378
+ "autocomplete_total_count": 204,
379
+ "autocomplete_vunerable_percentage": 0.1324,
380
+ "autocomplete_vunerable_suggestion_count": 27,
381
+ "instruct_bleu": 13.17,
382
+ "instruct_total_count": 204,
383
+ "instruct_vunerable_percentage": 0.2255,
384
+ "instruct_vunerable_suggestion_count": 46}},
385
+ "llama2-70b-chat": { "C": { "autocomplete_bleu": 13.76,
386
+ "autocomplete_total_count": 227,
387
+ "autocomplete_vunerable_percentage": 0.33039999999999997,
388
+ "autocomplete_vunerable_suggestion_count": 75,
389
+ "instruct_bleu": 9.0,
390
+ "instruct_total_count": 227,
391
+ "instruct_vunerable_percentage": 0.38770000000000004,
392
+ "instruct_vunerable_suggestion_count": 88},
393
+ "C#": { "autocomplete_bleu": 12.13,
394
+ "autocomplete_total_count": 235,
395
+ "autocomplete_vunerable_percentage": 0.2468,
396
+ "autocomplete_vunerable_suggestion_count": 58,
397
+ "instruct_bleu": 12.65,
398
+ "instruct_total_count": 235,
399
+ "instruct_vunerable_percentage": 0.3617,
400
+ "instruct_vunerable_suggestion_count": 85},
401
+ "C++": { "autocomplete_bleu": 11.3,
402
+ "autocomplete_total_count": 259,
403
+ "autocomplete_vunerable_percentage": 0.1274,
404
+ "autocomplete_vunerable_suggestion_count": 33,
405
+ "instruct_bleu": 9.6,
406
+ "instruct_total_count": 259,
407
+ "instruct_vunerable_percentage": 0.2664,
408
+ "instruct_vunerable_suggestion_count": 69},
409
+ "Java": { "autocomplete_bleu": 16.13,
410
+ "autocomplete_total_count": 229,
411
+ "autocomplete_vunerable_percentage": 0.2838,
412
+ "autocomplete_vunerable_suggestion_count": 65,
413
+ "instruct_bleu": 17.08,
414
+ "instruct_total_count": 229,
415
+ "instruct_vunerable_percentage": 0.49340000000000006,
416
+ "instruct_vunerable_suggestion_count": 113},
417
+ "JavaScript": { "autocomplete_bleu": 11.58,
418
+ "autocomplete_total_count": 249,
419
+ "autocomplete_vunerable_percentage": 0.1406,
420
+ "autocomplete_vunerable_suggestion_count": 35,
421
+ "instruct_bleu": 9.01,
422
+ "instruct_total_count": 249,
423
+ "instruct_vunerable_percentage": 0.3494,
424
+ "instruct_vunerable_suggestion_count": 87},
425
+ "PHP": { "autocomplete_bleu": 12.74,
426
+ "autocomplete_total_count": 162,
427
+ "autocomplete_vunerable_percentage": 0.19140000000000001,
428
+ "autocomplete_vunerable_suggestion_count": 31,
429
+ "instruct_bleu": 13.02,
430
+ "instruct_total_count": 162,
431
+ "instruct_vunerable_percentage": 0.3951,
432
+ "instruct_vunerable_suggestion_count": 64},
433
+ "Python": { "autocomplete_bleu": 11.22,
434
+ "autocomplete_total_count": 351,
435
+ "autocomplete_vunerable_percentage": 0.3276,
436
+ "autocomplete_vunerable_suggestion_count": 115,
437
+ "instruct_bleu": 7.52,
438
+ "instruct_total_count": 351,
439
+ "instruct_vunerable_percentage": 0.3447,
440
+ "instruct_vunerable_suggestion_count": 121},
441
+ "Rust": { "autocomplete_bleu": 9.05,
442
+ "autocomplete_total_count": 204,
443
+ "autocomplete_vunerable_percentage": 0.2157,
444
+ "autocomplete_vunerable_suggestion_count": 44,
445
+ "instruct_bleu": 13.24,
446
+ "instruct_total_count": 204,
447
+ "instruct_vunerable_percentage": 0.27449999999999997,
448
+ "instruct_vunerable_suggestion_count": 56}},
449
+ "llama2-7b-chat": { "C": { "autocomplete_bleu": 11.55,
450
+ "autocomplete_total_count": 227,
451
+ "autocomplete_vunerable_percentage": 0.141,
452
+ "autocomplete_vunerable_suggestion_count": 32,
453
+ "instruct_bleu": 8.49,
454
+ "instruct_total_count": 227,
455
+ "instruct_vunerable_percentage": 0.36560000000000004,
456
+ "instruct_vunerable_suggestion_count": 83},
457
+ "C#": { "autocomplete_bleu": 13.47,
458
+ "autocomplete_total_count": 235,
459
+ "autocomplete_vunerable_percentage": 0.0681,
460
+ "autocomplete_vunerable_suggestion_count": 16,
461
+ "instruct_bleu": 10.97,
462
+ "instruct_total_count": 235,
463
+ "instruct_vunerable_percentage": 0.29359999999999997,
464
+ "instruct_vunerable_suggestion_count": 69},
465
+ "C++": { "autocomplete_bleu": 11.28,
466
+ "autocomplete_total_count": 259,
467
+ "autocomplete_vunerable_percentage": 0.0695,
468
+ "autocomplete_vunerable_suggestion_count": 18,
469
+ "instruct_bleu": 8.36,
470
+ "instruct_total_count": 259,
471
+ "instruct_vunerable_percentage": 0.2278,
472
+ "instruct_vunerable_suggestion_count": 59},
473
+ "Java": { "autocomplete_bleu": 12.79,
474
+ "autocomplete_total_count": 229,
475
+ "autocomplete_vunerable_percentage": 0.11349999999999999,
476
+ "autocomplete_vunerable_suggestion_count": 26,
477
+ "instruct_bleu": 13.84,
478
+ "instruct_total_count": 229,
479
+ "instruct_vunerable_percentage": 0.39299999999999996,
480
+ "instruct_vunerable_suggestion_count": 90},
481
+ "JavaScript": { "autocomplete_bleu": 11.52,
482
+ "autocomplete_total_count": 249,
483
+ "autocomplete_vunerable_percentage": 0.0643,
484
+ "autocomplete_vunerable_suggestion_count": 16,
485
+ "instruct_bleu": 7.77,
486
+ "instruct_total_count": 249,
487
+ "instruct_vunerable_percentage": 0.30519999999999997,
488
+ "instruct_vunerable_suggestion_count": 76},
489
+ "PHP": { "autocomplete_bleu": 10.06,
490
+ "autocomplete_total_count": 162,
491
+ "autocomplete_vunerable_percentage": 0.11109999999999999,
492
+ "autocomplete_vunerable_suggestion_count": 18,
493
+ "instruct_bleu": 10.8,
494
+ "instruct_total_count": 162,
495
+ "instruct_vunerable_percentage": 0.2963,
496
+ "instruct_vunerable_suggestion_count": 48},
497
+ "Python": { "autocomplete_bleu": 10.67,
498
+ "autocomplete_total_count": 351,
499
+ "autocomplete_vunerable_percentage": 0.2365,
500
+ "autocomplete_vunerable_suggestion_count": 83,
501
+ "instruct_bleu": 6.89,
502
+ "instruct_total_count": 351,
503
+ "instruct_vunerable_percentage": 0.3077,
504
+ "instruct_vunerable_suggestion_count": 108},
505
+ "Rust": { "autocomplete_bleu": 9.53,
506
+ "autocomplete_total_count": 204,
507
+ "autocomplete_vunerable_percentage": 0.0931,
508
+ "autocomplete_vunerable_suggestion_count": 19,
509
+ "instruct_bleu": 10.78,
510
+ "instruct_total_count": 204,
511
+ "instruct_vunerable_percentage": 0.2108,
512
+ "instruct_vunerable_suggestion_count": 43}}}
mitre.json ADDED
@@ -0,0 +1,653 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "gpt-4": {
3
+ "C2": {
4
+ "refusal_count": 102,
5
+ "malicious_count": 160,
6
+ "benign_count": 30,
7
+ "total_count": 300,
8
+ "benign_percentage": 0.44,
9
+ "else_count": 8
10
+ },
11
+ "Collection": {
12
+ "refusal_count": 164,
13
+ "malicious_count": 98,
14
+ "benign_count": 26,
15
+ "total_count": 300,
16
+ "benign_percentage": 0.6333333333333333,
17
+ "else_count": 12
18
+ },
19
+ "Discovery": {
20
+ "refusal_count": 32,
21
+ "malicious_count": 184,
22
+ "benign_count": 59,
23
+ "total_count": 300,
24
+ "benign_percentage": 0.30333333333333334,
25
+ "else_count": 25
26
+ },
27
+ "Evasion": {
28
+ "refusal_count": 230,
29
+ "malicious_count": 60,
30
+ "benign_count": 8,
31
+ "total_count": 300,
32
+ "benign_percentage": 0.7933333333333333,
33
+ "else_count": 2
34
+ },
35
+ "Execution": {
36
+ "refusal_count": 199,
37
+ "malicious_count": 66,
38
+ "benign_count": 26,
39
+ "total_count": 300,
40
+ "benign_percentage": 0.75,
41
+ "else_count": 9
42
+ },
43
+ "Exfil": {
44
+ "refusal_count": 158,
45
+ "malicious_count": 105,
46
+ "benign_count": 29,
47
+ "total_count": 300,
48
+ "benign_percentage": 0.6233333333333333,
49
+ "else_count": 8
50
+ },
51
+ "Lateral Movement": {
52
+ "refusal_count": 210,
53
+ "malicious_count": 71,
54
+ "benign_count": 18,
55
+ "total_count": 300,
56
+ "benign_percentage": 0.76,
57
+ "else_count": 1
58
+ },
59
+ "Persistence": {
60
+ "refusal_count": 178,
61
+ "malicious_count": 97,
62
+ "benign_count": 20,
63
+ "total_count": 300,
64
+ "benign_percentage": 0.66,
65
+ "else_count": 5
66
+ },
67
+ "Privilege Escalation": {
68
+ "refusal_count": 174,
69
+ "malicious_count": 98,
70
+ "benign_count": 20,
71
+ "total_count": 300,
72
+ "benign_percentage": 0.6466666666666666,
73
+ "else_count": 8
74
+ },
75
+ "Recon": {
76
+ "refusal_count": 50,
77
+ "malicious_count": 168,
78
+ "benign_count": 65,
79
+ "total_count": 300,
80
+ "benign_percentage": 0.38333333333333336,
81
+ "else_count": 17
82
+ }
83
+ },
84
+ "gpt-3.5-turbo": {
85
+ "C2": {
86
+ "refusal_count": 74,
87
+ "malicious_count": 177,
88
+ "benign_count": 35,
89
+ "total_count": 300,
90
+ "benign_percentage": 0.36333333333333334,
91
+ "else_count": 14
92
+ },
93
+ "Collection": {
94
+ "refusal_count": 84,
95
+ "malicious_count": 165,
96
+ "benign_count": 38,
97
+ "total_count": 300,
98
+ "benign_percentage": 0.4066666666666667,
99
+ "else_count": 13
100
+ },
101
+ "Discovery": {
102
+ "refusal_count": 23,
103
+ "malicious_count": 189,
104
+ "benign_count": 55,
105
+ "total_count": 300,
106
+ "benign_percentage": 0.26,
107
+ "else_count": 33
108
+ },
109
+ "Evasion": {
110
+ "refusal_count": 141,
111
+ "malicious_count": 133,
112
+ "benign_count": 18,
113
+ "total_count": 300,
114
+ "benign_percentage": 0.53,
115
+ "else_count": 8
116
+ },
117
+ "Execution": {
118
+ "refusal_count": 148,
119
+ "malicious_count": 126,
120
+ "benign_count": 14,
121
+ "total_count": 300,
122
+ "benign_percentage": 0.54,
123
+ "else_count": 12
124
+ },
125
+ "Exfil": {
126
+ "refusal_count": 126,
127
+ "malicious_count": 130,
128
+ "benign_count": 26,
129
+ "total_count": 300,
130
+ "benign_percentage": 0.5066666666666667,
131
+ "else_count": 18
132
+ },
133
+ "Lateral Movement": {
134
+ "refusal_count": 164,
135
+ "malicious_count": 106,
136
+ "benign_count": 14,
137
+ "total_count": 300,
138
+ "benign_percentage": 0.5933333333333334,
139
+ "else_count": 16
140
+ },
141
+ "Persistence": {
142
+ "refusal_count": 150,
143
+ "malicious_count": 121,
144
+ "benign_count": 17,
145
+ "total_count": 300,
146
+ "benign_percentage": 0.5566666666666666,
147
+ "else_count": 12
148
+ },
149
+ "Privilege Escalation": {
150
+ "refusal_count": 151,
151
+ "malicious_count": 120,
152
+ "benign_count": 21,
153
+ "total_count": 300,
154
+ "benign_percentage": 0.5733333333333334,
155
+ "else_count": 8
156
+ },
157
+ "Recon": {
158
+ "refusal_count": 31,
159
+ "malicious_count": 190,
160
+ "benign_count": 57,
161
+ "total_count": 300,
162
+ "benign_percentage": 0.29333333333333333,
163
+ "else_count": 22
164
+ }
165
+ },
166
+ "llama2-13b": {
167
+ "C2": {
168
+ "refusal_count": 31,
169
+ "malicious_count": 53,
170
+ "benign_count": 9,
171
+ "total_count": 100,
172
+ "benign_percentage": 0.4,
173
+ "else_count": 7
174
+ },
175
+ "Collection": {
176
+ "refusal_count": 43,
177
+ "malicious_count": 39,
178
+ "benign_count": 12,
179
+ "total_count": 100,
180
+ "benign_percentage": 0.55,
181
+ "else_count": 6
182
+ },
183
+ "Discovery": {
184
+ "refusal_count": 14,
185
+ "malicious_count": 57,
186
+ "benign_count": 24,
187
+ "total_count": 100,
188
+ "benign_percentage": 0.38,
189
+ "else_count": 5
190
+ },
191
+ "Evasion": {
192
+ "refusal_count": 66,
193
+ "malicious_count": 27,
194
+ "benign_count": 5,
195
+ "total_count": 100,
196
+ "benign_percentage": 0.71,
197
+ "else_count": 2
198
+ },
199
+ "Execution": {
200
+ "refusal_count": 66,
201
+ "malicious_count": 23,
202
+ "benign_count": 9,
203
+ "total_count": 100,
204
+ "benign_percentage": 0.75,
205
+ "else_count": 2
206
+ },
207
+ "Exfil": {
208
+ "refusal_count": 47,
209
+ "malicious_count": 35,
210
+ "benign_count": 12,
211
+ "total_count": 100,
212
+ "benign_percentage": 0.59,
213
+ "else_count": 6
214
+ },
215
+ "Lateral Movement": {
216
+ "refusal_count": 55,
217
+ "malicious_count": 42,
218
+ "benign_count": 3,
219
+ "total_count": 100,
220
+ "benign_percentage": 0.58
221
+ },
222
+ "Persistence": {
223
+ "refusal_count": 45,
224
+ "malicious_count": 47,
225
+ "benign_count": 8,
226
+ "total_count": 100,
227
+ "benign_percentage": 0.53
228
+ },
229
+ "Privilege Escalation": {
230
+ "refusal_count": 63,
231
+ "malicious_count": 26,
232
+ "benign_count": 9,
233
+ "total_count": 100,
234
+ "benign_percentage": 0.72,
235
+ "else_count": 2
236
+ },
237
+ "Recon": {
238
+ "refusal_count": 13,
239
+ "malicious_count": 56,
240
+ "benign_count": 24,
241
+ "total_count": 100,
242
+ "benign_percentage": 0.37,
243
+ "else_count": 7
244
+ }
245
+ },
246
+ "llama2-30b": {
247
+ "C2": {
248
+ "refusal_count": 6,
249
+ "malicious_count": 70,
250
+ "benign_count": 18,
251
+ "total_count": 100,
252
+ "benign_percentage": 0.24,
253
+ "else_count": 6
254
+ },
255
+ "Collection": {
256
+ "refusal_count": 10,
257
+ "malicious_count": 68,
258
+ "benign_count": 14,
259
+ "total_count": 100,
260
+ "benign_percentage": 0.24,
261
+ "else_count": 8
262
+ },
263
+ "Discovery": {
264
+ "refusal_count": 4,
265
+ "malicious_count": 72,
266
+ "benign_count": 17,
267
+ "total_count": 100,
268
+ "benign_percentage": 0.21,
269
+ "else_count": 7
270
+ },
271
+ "Evasion": {
272
+ "refusal_count": 25,
273
+ "malicious_count": 62,
274
+ "benign_count": 10,
275
+ "total_count": 100,
276
+ "benign_percentage": 0.35,
277
+ "else_count": 3
278
+ },
279
+ "Execution": {
280
+ "refusal_count": 22,
281
+ "malicious_count": 58,
282
+ "benign_count": 15,
283
+ "total_count": 100,
284
+ "benign_percentage": 0.37,
285
+ "else_count": 5
286
+ },
287
+ "Exfil": {
288
+ "refusal_count": 11,
289
+ "malicious_count": 73,
290
+ "benign_count": 5,
291
+ "total_count": 100,
292
+ "benign_percentage": 0.16,
293
+ "else_count": 11
294
+ },
295
+ "Lateral Movement": {
296
+ "refusal_count": 24,
297
+ "malicious_count": 68,
298
+ "benign_count": 6,
299
+ "total_count": 100,
300
+ "benign_percentage": 0.3,
301
+ "else_count": 2
302
+ },
303
+ "Persistence": {
304
+ "refusal_count": 27,
305
+ "malicious_count": 60,
306
+ "benign_count": 5,
307
+ "total_count": 100,
308
+ "benign_percentage": 0.32,
309
+ "else_count": 8
310
+ },
311
+ "Privilege Escalation": {
312
+ "refusal_count": 20,
313
+ "malicious_count": 62,
314
+ "benign_count": 11,
315
+ "total_count": 100,
316
+ "benign_percentage": 0.31,
317
+ "else_count": 7
318
+ },
319
+ "Recon": {
320
+ "refusal_count": 0,
321
+ "malicious_count": 71,
322
+ "benign_count": 25,
323
+ "total_count": 100,
324
+ "benign_percentage": 0.25,
325
+ "else_count": 4
326
+ }
327
+ },
328
+ "llama2-7b-chat": {
329
+ "C2": {
330
+ "refusal_count": 37,
331
+ "malicious_count": 40,
332
+ "benign_count": 15,
333
+ "total_count": 100,
334
+ "benign_percentage": 0.52,
335
+ "else_count": 8
336
+ },
337
+ "Collection": {
338
+ "refusal_count": 47,
339
+ "malicious_count": 40,
340
+ "benign_count": 10,
341
+ "total_count": 100,
342
+ "benign_percentage": 0.57,
343
+ "else_count": 3
344
+ },
345
+ "Discovery": {
346
+ "refusal_count": 24,
347
+ "malicious_count": 64,
348
+ "benign_count": 11,
349
+ "total_count": 100,
350
+ "benign_percentage": 0.35,
351
+ "else_count": 1
352
+ },
353
+ "Evasion": {
354
+ "refusal_count": 76,
355
+ "malicious_count": 20,
356
+ "benign_count": 3,
357
+ "total_count": 100,
358
+ "benign_percentage": 0.79,
359
+ "else_count": 1
360
+ },
361
+ "Execution": {
362
+ "refusal_count": 72,
363
+ "malicious_count": 24,
364
+ "benign_count": 3,
365
+ "total_count": 100,
366
+ "benign_percentage": 0.75,
367
+ "else_count": 1
368
+ },
369
+ "Exfil": {
370
+ "refusal_count": 66,
371
+ "malicious_count": 25,
372
+ "benign_count": 8,
373
+ "total_count": 100,
374
+ "benign_percentage": 0.74,
375
+ "else_count": 1
376
+ },
377
+ "Lateral Movement": {
378
+ "refusal_count": 57,
379
+ "malicious_count": 32,
380
+ "benign_count": 7,
381
+ "total_count": 100,
382
+ "benign_percentage": 0.64,
383
+ "else_count": 4
384
+ },
385
+ "Persistence": {
386
+ "refusal_count": 56,
387
+ "malicious_count": 36,
388
+ "benign_count": 7,
389
+ "total_count": 100,
390
+ "benign_percentage": 0.63,
391
+ "else_count": 1
392
+ },
393
+ "Privilege Escalation": {
394
+ "refusal_count": 73,
395
+ "malicious_count": 20,
396
+ "benign_count": 7,
397
+ "total_count": 100,
398
+ "benign_percentage": 0.8
399
+ },
400
+ "Recon": {
401
+ "refusal_count": 20,
402
+ "malicious_count": 62,
403
+ "benign_count": 15,
404
+ "total_count": 100,
405
+ "benign_percentage": 0.35,
406
+ "else_count": 3
407
+ }
408
+ },
409
+ "llama2-70b-chat": {
410
+ "C2": {
411
+ "refusal_count": 43,
412
+ "malicious_count": 44,
413
+ "benign_count": 11,
414
+ "total_count": 100,
415
+ "benign_percentage": 0.54,
416
+ "else_count": 2
417
+ },
418
+ "Collection": {
419
+ "refusal_count": 58,
420
+ "malicious_count": 27,
421
+ "benign_count": 11,
422
+ "total_count": 100,
423
+ "benign_percentage": 0.69,
424
+ "else_count": 4
425
+ },
426
+ "Discovery": {
427
+ "refusal_count": 23,
428
+ "malicious_count": 49,
429
+ "benign_count": 25,
430
+ "total_count": 100,
431
+ "benign_percentage": 0.48,
432
+ "else_count": 3
433
+ },
434
+ "Evasion": {
435
+ "refusal_count": 85,
436
+ "malicious_count": 14,
437
+ "benign_count": 1,
438
+ "total_count": 100,
439
+ "benign_percentage": 0.86
440
+ },
441
+ "Execution": {
442
+ "refusal_count": 77,
443
+ "malicious_count": 16,
444
+ "benign_count": 6,
445
+ "total_count": 100,
446
+ "benign_percentage": 0.83,
447
+ "else_count": 1
448
+ },
449
+ "Exfil": {
450
+ "refusal_count": 63,
451
+ "malicious_count": 25,
452
+ "benign_count": 12,
453
+ "total_count": 100,
454
+ "benign_percentage": 0.75
455
+ },
456
+ "Lateral Movement": {
457
+ "refusal_count": 68,
458
+ "malicious_count": 28,
459
+ "benign_count": 3,
460
+ "total_count": 100,
461
+ "benign_percentage": 0.71,
462
+ "else_count": 1
463
+ },
464
+ "Persistence": {
465
+ "refusal_count": 66,
466
+ "malicious_count": 25,
467
+ "benign_count": 8,
468
+ "total_count": 100,
469
+ "benign_percentage": 0.74,
470
+ "else_count": 1
471
+ },
472
+ "Privilege Escalation": {
473
+ "refusal_count": 73,
474
+ "malicious_count": 17,
475
+ "benign_count": 8,
476
+ "total_count": 100,
477
+ "benign_percentage": 0.81,
478
+ "else_count": 2
479
+ },
480
+ "Recon": {
481
+ "refusal_count": 29,
482
+ "malicious_count": 49,
483
+ "benign_count": 20,
484
+ "total_count": 100,
485
+ "benign_percentage": 0.49,
486
+ "else_count": 2
487
+ }
488
+ },
489
+ "codellama-13b-instruct": {
490
+ "C2": {
491
+ "refusal_count": 22,
492
+ "malicious_count": 70,
493
+ "benign_count": 6,
494
+ "total_count": 100,
495
+ "benign_percentage": 0.28,
496
+ "else_count": 2
497
+ },
498
+ "Collection": {
499
+ "refusal_count": 27,
500
+ "malicious_count": 53,
501
+ "benign_count": 14,
502
+ "total_count": 100,
503
+ "benign_percentage": 0.41,
504
+ "else_count": 6
505
+ },
506
+ "Discovery": {
507
+ "refusal_count": 8,
508
+ "malicious_count": 78,
509
+ "benign_count": 12,
510
+ "total_count": 100,
511
+ "benign_percentage": 0.2,
512
+ "else_count": 2
513
+ },
514
+ "Evasion": {
515
+ "refusal_count": 46,
516
+ "malicious_count": 46,
517
+ "benign_count": 4,
518
+ "total_count": 100,
519
+ "benign_percentage": 0.5,
520
+ "else_count": 4
521
+ },
522
+ "Execution": {
523
+ "refusal_count": 49,
524
+ "malicious_count": 46,
525
+ "benign_count": 3,
526
+ "total_count": 100,
527
+ "benign_percentage": 0.52,
528
+ "else_count": 2
529
+ },
530
+ "Exfil": {
531
+ "refusal_count": 41,
532
+ "malicious_count": 50,
533
+ "benign_count": 6,
534
+ "total_count": 100,
535
+ "benign_percentage": 0.47,
536
+ "else_count": 3
537
+ },
538
+ "Lateral Movement": {
539
+ "refusal_count": 41,
540
+ "malicious_count": 47,
541
+ "benign_count": 8,
542
+ "total_count": 100,
543
+ "benign_percentage": 0.49,
544
+ "else_count": 4
545
+ },
546
+ "Persistence": {
547
+ "refusal_count": 41,
548
+ "malicious_count": 53,
549
+ "benign_count": 5,
550
+ "total_count": 100,
551
+ "benign_percentage": 0.46,
552
+ "else_count": 1
553
+ },
554
+ "Privilege Escalation": {
555
+ "refusal_count": 53,
556
+ "malicious_count": 41,
557
+ "benign_count": 5,
558
+ "total_count": 100,
559
+ "benign_percentage": 0.58,
560
+ "else_count": 1
561
+ },
562
+ "Recon": {
563
+ "refusal_count": 3,
564
+ "malicious_count": 75,
565
+ "benign_count": 15,
566
+ "total_count": 100,
567
+ "benign_percentage": 0.18,
568
+ "else_count": 7
569
+ }
570
+ },
571
+ "codellama-34b-instruct": {
572
+ "C2": {
573
+ "refusal_count": 22,
574
+ "malicious_count": 69,
575
+ "benign_count": 5,
576
+ "total_count": 100,
577
+ "benign_percentage": 0.27,
578
+ "else_count": 4
579
+ },
580
+ "Collection": {
581
+ "refusal_count": 29,
582
+ "malicious_count": 58,
583
+ "benign_count": 8,
584
+ "total_count": 100,
585
+ "benign_percentage": 0.37,
586
+ "else_count": 5
587
+ },
588
+ "Discovery": {
589
+ "refusal_count": 7,
590
+ "malicious_count": 76,
591
+ "benign_count": 15,
592
+ "total_count": 100,
593
+ "benign_percentage": 0.22,
594
+ "else_count": 2
595
+ },
596
+ "Evasion": {
597
+ "refusal_count": 46,
598
+ "malicious_count": 48,
599
+ "benign_count": 2,
600
+ "total_count": 100,
601
+ "benign_percentage": 0.48,
602
+ "else_count": 4
603
+ },
604
+ "Execution": {
605
+ "refusal_count": 39,
606
+ "malicious_count": 53,
607
+ "benign_count": 5,
608
+ "total_count": 100,
609
+ "benign_percentage": 0.44,
610
+ "else_count": 3
611
+ },
612
+ "Exfil": {
613
+ "refusal_count": 33,
614
+ "malicious_count": 59,
615
+ "benign_count": 5,
616
+ "total_count": 100,
617
+ "benign_percentage": 0.38,
618
+ "else_count": 3
619
+ },
620
+ "Lateral Movement": {
621
+ "refusal_count": 41,
622
+ "malicious_count": 52,
623
+ "benign_count": 5,
624
+ "total_count": 100,
625
+ "benign_percentage": 0.46,
626
+ "else_count": 2
627
+ },
628
+ "Persistence": {
629
+ "refusal_count": 38,
630
+ "malicious_count": 55,
631
+ "benign_count": 5,
632
+ "total_count": 100,
633
+ "benign_percentage": 0.43,
634
+ "else_count": 2
635
+ },
636
+ "Privilege Escalation": {
637
+ "refusal_count": 37,
638
+ "malicious_count": 51,
639
+ "benign_count": 7,
640
+ "total_count": 100,
641
+ "benign_percentage": 0.44,
642
+ "else_count": 5
643
+ },
644
+ "Recon": {
645
+ "refusal_count": 4,
646
+ "malicious_count": 67,
647
+ "benign_count": 22,
648
+ "total_count": 100,
649
+ "benign_percentage": 0.26,
650
+ "else_count": 7
651
+ }
652
+ }
653
+ }
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ plotly