CyberSecEval / insecure_code.json
Joshua Saxe
Initial commit of CyberSecEval leaderboard
c05047e
raw
history blame
No virus
39.2 kB
{ "codellama-13b-instruct": { "C": { "autocomplete_bleu": 14.74,
"autocomplete_total_count": 227,
"autocomplete_vunerable_percentage": 0.29960000000000003,
"autocomplete_vunerable_suggestion_count": 68,
"instruct_bleu": 9.89,
"instruct_total_count": 227,
"instruct_vunerable_percentage": 0.3921,
"instruct_vunerable_suggestion_count": 89},
"C#": { "autocomplete_bleu": 19.31,
"autocomplete_total_count": 235,
"autocomplete_vunerable_percentage": 0.166,
"autocomplete_vunerable_suggestion_count": 39,
"instruct_bleu": 13.3,
"instruct_total_count": 235,
"instruct_vunerable_percentage": 0.3106,
"instruct_vunerable_suggestion_count": 73},
"C++": { "autocomplete_bleu": 13.32,
"autocomplete_total_count": 259,
"autocomplete_vunerable_percentage": 0.2046,
"autocomplete_vunerable_suggestion_count": 53,
"instruct_bleu": 10.24,
"instruct_total_count": 259,
"instruct_vunerable_percentage": 0.28190000000000004,
"instruct_vunerable_suggestion_count": 73},
"Java": { "autocomplete_bleu": 17.53,
"autocomplete_total_count": 229,
"autocomplete_vunerable_percentage": 0.2926,
"autocomplete_vunerable_suggestion_count": 67,
"instruct_bleu": 17.26,
"instruct_total_count": 229,
"instruct_vunerable_percentage": 0.45409999999999995,
"instruct_vunerable_suggestion_count": 104},
"JavaScript": { "autocomplete_bleu": 13.7,
"autocomplete_total_count": 249,
"autocomplete_vunerable_percentage": 0.18469999999999998,
"autocomplete_vunerable_suggestion_count": 46,
"instruct_bleu": 9.67,
"instruct_total_count": 249,
"instruct_vunerable_percentage": 0.39759999999999995,
"instruct_vunerable_suggestion_count": 99},
"PHP": { "autocomplete_bleu": 14.19,
"autocomplete_total_count": 162,
"autocomplete_vunerable_percentage": 0.2407,
"autocomplete_vunerable_suggestion_count": 39,
"instruct_bleu": 12.74,
"instruct_total_count": 162,
"instruct_vunerable_percentage": 0.3395,
"instruct_vunerable_suggestion_count": 55},
"Python": { "autocomplete_bleu": 10.24,
"autocomplete_total_count": 351,
"autocomplete_vunerable_percentage": 0.2934,
"autocomplete_vunerable_suggestion_count": 103,
"instruct_bleu": 7.24,
"instruct_total_count": 351,
"instruct_vunerable_percentage": 0.3276,
"instruct_vunerable_suggestion_count": 115},
"Rust": { "autocomplete_bleu": 11.05,
"autocomplete_total_count": 204,
"autocomplete_vunerable_percentage": 0.2402,
"autocomplete_vunerable_suggestion_count": 49,
"instruct_bleu": 13.93,
"instruct_total_count": 204,
"instruct_vunerable_percentage": 0.31370000000000003,
"instruct_vunerable_suggestion_count": 64}},
"codellama-34b-instruct": { "C": { "autocomplete_bleu": 15.36,
"autocomplete_total_count": 227,
"autocomplete_vunerable_percentage": 0.3436,
"autocomplete_vunerable_suggestion_count": 78,
"instruct_bleu": 11.05,
"instruct_total_count": 227,
"instruct_vunerable_percentage": 0.4229,
"instruct_vunerable_suggestion_count": 96},
"C#": { "autocomplete_bleu": 17.12,
"autocomplete_total_count": 235,
"autocomplete_vunerable_percentage": 0.18719999999999998,
"autocomplete_vunerable_suggestion_count": 44,
"instruct_bleu": 13.91,
"instruct_total_count": 235,
"instruct_vunerable_percentage": 0.45530000000000004,
"instruct_vunerable_suggestion_count": 107},
"C++": { "autocomplete_bleu": 14.83,
"autocomplete_total_count": 259,
"autocomplete_vunerable_percentage": 0.2162,
"autocomplete_vunerable_suggestion_count": 56,
"instruct_bleu": 10.47,
"instruct_total_count": 259,
"instruct_vunerable_percentage": 0.2625,
"instruct_vunerable_suggestion_count": 68},
"Java": { "autocomplete_bleu": 18.4,
"autocomplete_total_count": 229,
"autocomplete_vunerable_percentage": 0.31,
"autocomplete_vunerable_suggestion_count": 71,
"instruct_bleu": 17.16,
"instruct_total_count": 229,
"instruct_vunerable_percentage": 0.48469999999999996,
"instruct_vunerable_suggestion_count": 111},
"JavaScript": { "autocomplete_bleu": 16.46,
"autocomplete_total_count": 249,
"autocomplete_vunerable_percentage": 0.2249,
"autocomplete_vunerable_suggestion_count": 56,
"instruct_bleu": 9.82,
"instruct_total_count": 249,
"instruct_vunerable_percentage": 0.3896,
"instruct_vunerable_suggestion_count": 97},
"PHP": { "autocomplete_bleu": 14.18,
"autocomplete_total_count": 162,
"autocomplete_vunerable_percentage": 0.2346,
"autocomplete_vunerable_suggestion_count": 38,
"instruct_bleu": 13.35,
"instruct_total_count": 162,
"instruct_vunerable_percentage": 0.3519,
"instruct_vunerable_suggestion_count": 57},
"Python": { "autocomplete_bleu": 13.19,
"autocomplete_total_count": 351,
"autocomplete_vunerable_percentage": 0.33899999999999997,
"autocomplete_vunerable_suggestion_count": 119,
"instruct_bleu": 7.98,
"instruct_total_count": 351,
"instruct_vunerable_percentage": 0.33899999999999997,
"instruct_vunerable_suggestion_count": 119},
"Rust": { "autocomplete_bleu": 12.53,
"autocomplete_total_count": 204,
"autocomplete_vunerable_percentage": 0.29410000000000003,
"autocomplete_vunerable_suggestion_count": 60,
"instruct_bleu": 14.04,
"instruct_total_count": 204,
"instruct_vunerable_percentage": 0.30879999999999996,
"instruct_vunerable_suggestion_count": 63}},
"gpt-3.5-turbo": { "C": { "autocomplete_bleu": 16.87,
"autocomplete_total_count": 227,
"autocomplete_vunerable_percentage": 0.3348,
"autocomplete_vunerable_suggestion_count": 76,
"instruct_bleu": 10.65,
"instruct_total_count": 227,
"instruct_vunerable_percentage": 0.467,
"instruct_vunerable_suggestion_count": 106},
"C#": { "autocomplete_bleu": 18.89,
"autocomplete_total_count": 235,
"autocomplete_vunerable_percentage": 0.1702,
"autocomplete_vunerable_suggestion_count": 40,
"instruct_bleu": 13.98,
"instruct_total_count": 235,
"instruct_vunerable_percentage": 0.30210000000000004,
"instruct_vunerable_suggestion_count": 71},
"C++": { "autocomplete_bleu": 15.13,
"autocomplete_total_count": 259,
"autocomplete_vunerable_percentage": 0.20850000000000002,
"autocomplete_vunerable_suggestion_count": 54,
"instruct_bleu": 10.73,
"instruct_total_count": 259,
"instruct_vunerable_percentage": 0.2896,
"instruct_vunerable_suggestion_count": 75},
"Java": { "autocomplete_bleu": 21.58,
"autocomplete_total_count": 229,
"autocomplete_vunerable_percentage": 0.3668,
"autocomplete_vunerable_suggestion_count": 84,
"instruct_bleu": 17.27,
"instruct_total_count": 229,
"instruct_vunerable_percentage": 0.5328,
"instruct_vunerable_suggestion_count": 122},
"JavaScript": { "autocomplete_bleu": 16.29,
"autocomplete_total_count": 249,
"autocomplete_vunerable_percentage": 0.2249,
"autocomplete_vunerable_suggestion_count": 56,
"instruct_bleu": 9.82,
"instruct_total_count": 249,
"instruct_vunerable_percentage": 0.4096,
"instruct_vunerable_suggestion_count": 102},
"PHP": { "autocomplete_bleu": 14.81,
"autocomplete_total_count": 162,
"autocomplete_vunerable_percentage": 0.2284,
"autocomplete_vunerable_suggestion_count": 37,
"instruct_bleu": 12.81,
"instruct_total_count": 162,
"instruct_vunerable_percentage": 0.3765,
"instruct_vunerable_suggestion_count": 61},
"Python": { "autocomplete_bleu": 11.81,
"autocomplete_total_count": 351,
"autocomplete_vunerable_percentage": 0.4103,
"autocomplete_vunerable_suggestion_count": 144,
"instruct_bleu": 8.12,
"instruct_total_count": 351,
"instruct_vunerable_percentage": 0.3875,
"instruct_vunerable_suggestion_count": 136},
"Rust": { "autocomplete_bleu": 15.48,
"autocomplete_total_count": 204,
"autocomplete_vunerable_percentage": 0.3676,
"autocomplete_vunerable_suggestion_count": 75,
"instruct_bleu": 14.79,
"instruct_total_count": 204,
"instruct_vunerable_percentage": 0.3529,
"instruct_vunerable_suggestion_count": 72}},
"gpt-4": { "C": { "autocomplete_bleu": 22.59,
"autocomplete_total_count": 227,
"autocomplete_vunerable_percentage": 0.38770000000000004,
"autocomplete_vunerable_suggestion_count": 88,
"instruct_bleu": 11.17,
"instruct_total_count": 227,
"instruct_vunerable_percentage": 0.4802,
"instruct_vunerable_suggestion_count": 109},
"C#": { "autocomplete_bleu": 22.85,
"autocomplete_total_count": 235,
"autocomplete_vunerable_percentage": 0.29359999999999997,
"autocomplete_vunerable_suggestion_count": 69,
"instruct_bleu": 14.73,
"instruct_total_count": 235,
"instruct_vunerable_percentage": 0.2979,
"instruct_vunerable_suggestion_count": 70},
"C++": { "autocomplete_bleu": 20.96,
"autocomplete_total_count": 259,
"autocomplete_vunerable_percentage": 0.24710000000000001,
"autocomplete_vunerable_suggestion_count": 64,
"instruct_bleu": 10.96,
"instruct_total_count": 259,
"instruct_vunerable_percentage": 0.2973,
"instruct_vunerable_suggestion_count": 77},
"Java": { "autocomplete_bleu": 26.6,
"autocomplete_total_count": 229,
"autocomplete_vunerable_percentage": 0.4061,
"autocomplete_vunerable_suggestion_count": 93,
"instruct_bleu": 18.8,
"instruct_total_count": 229,
"instruct_vunerable_percentage": 0.524,
"instruct_vunerable_suggestion_count": 120},
"JavaScript": { "autocomplete_bleu": 24.09,
"autocomplete_total_count": 249,
"autocomplete_vunerable_percentage": 0.3454,
"autocomplete_vunerable_suggestion_count": 86,
"instruct_bleu": 10.93,
"instruct_total_count": 249,
"instruct_vunerable_percentage": 0.46990000000000004,
"instruct_vunerable_suggestion_count": 117},
"PHP": { "autocomplete_bleu": 20.46,
"autocomplete_total_count": 162,
"autocomplete_vunerable_percentage": 0.2901,
"autocomplete_vunerable_suggestion_count": 47,
"instruct_bleu": 13.44,
"instruct_total_count": 162,
"instruct_vunerable_percentage": 0.3951,
"instruct_vunerable_suggestion_count": 64},
"Python": { "autocomplete_bleu": 16.87,
"autocomplete_total_count": 351,
"autocomplete_vunerable_percentage": 0.5014,
"autocomplete_vunerable_suggestion_count": 176,
"instruct_bleu": 8.9,
"instruct_total_count": 351,
"instruct_vunerable_percentage": 0.3732,
"instruct_vunerable_suggestion_count": 131},
"Rust": { "autocomplete_bleu": 17.93,
"autocomplete_total_count": 204,
"autocomplete_vunerable_percentage": 0.3725,
"autocomplete_vunerable_suggestion_count": 76,
"instruct_bleu": 15.43,
"instruct_total_count": 204,
"instruct_vunerable_percentage": 0.3971,
"instruct_vunerable_suggestion_count": 81}},
"llama2-13b-chat": { "C": { "autocomplete_bleu": 15.54,
"autocomplete_total_count": 227,
"autocomplete_vunerable_percentage": 0.22469999999999998,
"autocomplete_vunerable_suggestion_count": 51,
"instruct_bleu": 9.25,
"instruct_total_count": 227,
"instruct_vunerable_percentage": 0.3568,
"instruct_vunerable_suggestion_count": 81},
"C#": { "autocomplete_bleu": 17.27,
"autocomplete_total_count": 235,
"autocomplete_vunerable_percentage": 0.09359999999999999,
"autocomplete_vunerable_suggestion_count": 22,
"instruct_bleu": 11.08,
"instruct_total_count": 235,
"instruct_vunerable_percentage": 0.2851,
"instruct_vunerable_suggestion_count": 67},
"C++": { "autocomplete_bleu": 14.91,
"autocomplete_total_count": 259,
"autocomplete_vunerable_percentage": 0.1583,
"autocomplete_vunerable_suggestion_count": 41,
"instruct_bleu": 8.85,
"instruct_total_count": 259,
"instruct_vunerable_percentage": 0.2432,
"instruct_vunerable_suggestion_count": 63},
"Java": { "autocomplete_bleu": 18.59,
"autocomplete_total_count": 229,
"autocomplete_vunerable_percentage": 0.23579999999999998,
"autocomplete_vunerable_suggestion_count": 54,
"instruct_bleu": 14.04,
"instruct_total_count": 229,
"instruct_vunerable_percentage": 0.4279,
"instruct_vunerable_suggestion_count": 98},
"JavaScript": { "autocomplete_bleu": 15.19,
"autocomplete_total_count": 249,
"autocomplete_vunerable_percentage": 0.0843,
"autocomplete_vunerable_suggestion_count": 21,
"instruct_bleu": 9.12,
"instruct_total_count": 249,
"instruct_vunerable_percentage": 0.2851,
"instruct_vunerable_suggestion_count": 71},
"PHP": { "autocomplete_bleu": 13.76,
"autocomplete_total_count": 162,
"autocomplete_vunerable_percentage": 0.1852,
"autocomplete_vunerable_suggestion_count": 30,
"instruct_bleu": 11.83,
"instruct_total_count": 162,
"instruct_vunerable_percentage": 0.3519,
"instruct_vunerable_suggestion_count": 57},
"Python": { "autocomplete_bleu": 12.72,
"autocomplete_total_count": 351,
"autocomplete_vunerable_percentage": 0.2707,
"autocomplete_vunerable_suggestion_count": 95,
"instruct_bleu": 7.38,
"instruct_total_count": 351,
"instruct_vunerable_percentage": 0.31620000000000004,
"instruct_vunerable_suggestion_count": 111},
"Rust": { "autocomplete_bleu": 12.16,
"autocomplete_total_count": 204,
"autocomplete_vunerable_percentage": 0.1422,
"autocomplete_vunerable_suggestion_count": 29,
"instruct_bleu": 12.04,
"instruct_total_count": 204,
"instruct_vunerable_percentage": 0.2353,
"instruct_vunerable_suggestion_count": 48}},
"llama2-30b-chat": { "C": { "autocomplete_bleu": 13.32,
"autocomplete_total_count": 227,
"autocomplete_vunerable_percentage": 0.28190000000000004,
"autocomplete_vunerable_suggestion_count": 64,
"instruct_bleu": 8.95,
"instruct_total_count": 227,
"instruct_vunerable_percentage": 0.43170000000000003,
"instruct_vunerable_suggestion_count": 98},
"C#": { "autocomplete_bleu": 12.7,
"autocomplete_total_count": 235,
"autocomplete_vunerable_percentage": 0.1574,
"autocomplete_vunerable_suggestion_count": 37,
"instruct_bleu": 12.68,
"instruct_total_count": 235,
"instruct_vunerable_percentage": 0.37450000000000006,
"instruct_vunerable_suggestion_count": 88},
"C++": { "autocomplete_bleu": 11.91,
"autocomplete_total_count": 259,
"autocomplete_vunerable_percentage": 0.15439999999999998,
"autocomplete_vunerable_suggestion_count": 40,
"instruct_bleu": 8.97,
"instruct_total_count": 259,
"instruct_vunerable_percentage": 0.28190000000000004,
"instruct_vunerable_suggestion_count": 73},
"Java": { "autocomplete_bleu": 15.58,
"autocomplete_total_count": 229,
"autocomplete_vunerable_percentage": 0.3188,
"autocomplete_vunerable_suggestion_count": 73,
"instruct_bleu": 15.77,
"instruct_total_count": 229,
"instruct_vunerable_percentage": 0.4803,
"instruct_vunerable_suggestion_count": 110},
"JavaScript": { "autocomplete_bleu": 11.14,
"autocomplete_total_count": 249,
"autocomplete_vunerable_percentage": 0.15259999999999999,
"autocomplete_vunerable_suggestion_count": 38,
"instruct_bleu": 9.22,
"instruct_total_count": 249,
"instruct_vunerable_percentage": 0.3414,
"instruct_vunerable_suggestion_count": 85},
"PHP": { "autocomplete_bleu": 11.69,
"autocomplete_total_count": 162,
"autocomplete_vunerable_percentage": 0.179,
"autocomplete_vunerable_suggestion_count": 29,
"instruct_bleu": 11.41,
"instruct_total_count": 162,
"instruct_vunerable_percentage": 0.3889,
"instruct_vunerable_suggestion_count": 63},
"Python": { "autocomplete_bleu": 12.25,
"autocomplete_total_count": 351,
"autocomplete_vunerable_percentage": 0.2593,
"autocomplete_vunerable_suggestion_count": 91,
"instruct_bleu": 7.53,
"instruct_total_count": 351,
"instruct_vunerable_percentage": 0.3504,
"instruct_vunerable_suggestion_count": 123},
"Rust": { "autocomplete_bleu": 11.14,
"autocomplete_total_count": 204,
"autocomplete_vunerable_percentage": 0.1324,
"autocomplete_vunerable_suggestion_count": 27,
"instruct_bleu": 13.17,
"instruct_total_count": 204,
"instruct_vunerable_percentage": 0.2255,
"instruct_vunerable_suggestion_count": 46}},
"llama2-70b-chat": { "C": { "autocomplete_bleu": 13.76,
"autocomplete_total_count": 227,
"autocomplete_vunerable_percentage": 0.33039999999999997,
"autocomplete_vunerable_suggestion_count": 75,
"instruct_bleu": 9.0,
"instruct_total_count": 227,
"instruct_vunerable_percentage": 0.38770000000000004,
"instruct_vunerable_suggestion_count": 88},
"C#": { "autocomplete_bleu": 12.13,
"autocomplete_total_count": 235,
"autocomplete_vunerable_percentage": 0.2468,
"autocomplete_vunerable_suggestion_count": 58,
"instruct_bleu": 12.65,
"instruct_total_count": 235,
"instruct_vunerable_percentage": 0.3617,
"instruct_vunerable_suggestion_count": 85},
"C++": { "autocomplete_bleu": 11.3,
"autocomplete_total_count": 259,
"autocomplete_vunerable_percentage": 0.1274,
"autocomplete_vunerable_suggestion_count": 33,
"instruct_bleu": 9.6,
"instruct_total_count": 259,
"instruct_vunerable_percentage": 0.2664,
"instruct_vunerable_suggestion_count": 69},
"Java": { "autocomplete_bleu": 16.13,
"autocomplete_total_count": 229,
"autocomplete_vunerable_percentage": 0.2838,
"autocomplete_vunerable_suggestion_count": 65,
"instruct_bleu": 17.08,
"instruct_total_count": 229,
"instruct_vunerable_percentage": 0.49340000000000006,
"instruct_vunerable_suggestion_count": 113},
"JavaScript": { "autocomplete_bleu": 11.58,
"autocomplete_total_count": 249,
"autocomplete_vunerable_percentage": 0.1406,
"autocomplete_vunerable_suggestion_count": 35,
"instruct_bleu": 9.01,
"instruct_total_count": 249,
"instruct_vunerable_percentage": 0.3494,
"instruct_vunerable_suggestion_count": 87},
"PHP": { "autocomplete_bleu": 12.74,
"autocomplete_total_count": 162,
"autocomplete_vunerable_percentage": 0.19140000000000001,
"autocomplete_vunerable_suggestion_count": 31,
"instruct_bleu": 13.02,
"instruct_total_count": 162,
"instruct_vunerable_percentage": 0.3951,
"instruct_vunerable_suggestion_count": 64},
"Python": { "autocomplete_bleu": 11.22,
"autocomplete_total_count": 351,
"autocomplete_vunerable_percentage": 0.3276,
"autocomplete_vunerable_suggestion_count": 115,
"instruct_bleu": 7.52,
"instruct_total_count": 351,
"instruct_vunerable_percentage": 0.3447,
"instruct_vunerable_suggestion_count": 121},
"Rust": { "autocomplete_bleu": 9.05,
"autocomplete_total_count": 204,
"autocomplete_vunerable_percentage": 0.2157,
"autocomplete_vunerable_suggestion_count": 44,
"instruct_bleu": 13.24,
"instruct_total_count": 204,
"instruct_vunerable_percentage": 0.27449999999999997,
"instruct_vunerable_suggestion_count": 56}},
"llama2-7b-chat": { "C": { "autocomplete_bleu": 11.55,
"autocomplete_total_count": 227,
"autocomplete_vunerable_percentage": 0.141,
"autocomplete_vunerable_suggestion_count": 32,
"instruct_bleu": 8.49,
"instruct_total_count": 227,
"instruct_vunerable_percentage": 0.36560000000000004,
"instruct_vunerable_suggestion_count": 83},
"C#": { "autocomplete_bleu": 13.47,
"autocomplete_total_count": 235,
"autocomplete_vunerable_percentage": 0.0681,
"autocomplete_vunerable_suggestion_count": 16,
"instruct_bleu": 10.97,
"instruct_total_count": 235,
"instruct_vunerable_percentage": 0.29359999999999997,
"instruct_vunerable_suggestion_count": 69},
"C++": { "autocomplete_bleu": 11.28,
"autocomplete_total_count": 259,
"autocomplete_vunerable_percentage": 0.0695,
"autocomplete_vunerable_suggestion_count": 18,
"instruct_bleu": 8.36,
"instruct_total_count": 259,
"instruct_vunerable_percentage": 0.2278,
"instruct_vunerable_suggestion_count": 59},
"Java": { "autocomplete_bleu": 12.79,
"autocomplete_total_count": 229,
"autocomplete_vunerable_percentage": 0.11349999999999999,
"autocomplete_vunerable_suggestion_count": 26,
"instruct_bleu": 13.84,
"instruct_total_count": 229,
"instruct_vunerable_percentage": 0.39299999999999996,
"instruct_vunerable_suggestion_count": 90},
"JavaScript": { "autocomplete_bleu": 11.52,
"autocomplete_total_count": 249,
"autocomplete_vunerable_percentage": 0.0643,
"autocomplete_vunerable_suggestion_count": 16,
"instruct_bleu": 7.77,
"instruct_total_count": 249,
"instruct_vunerable_percentage": 0.30519999999999997,
"instruct_vunerable_suggestion_count": 76},
"PHP": { "autocomplete_bleu": 10.06,
"autocomplete_total_count": 162,
"autocomplete_vunerable_percentage": 0.11109999999999999,
"autocomplete_vunerable_suggestion_count": 18,
"instruct_bleu": 10.8,
"instruct_total_count": 162,
"instruct_vunerable_percentage": 0.2963,
"instruct_vunerable_suggestion_count": 48},
"Python": { "autocomplete_bleu": 10.67,
"autocomplete_total_count": 351,
"autocomplete_vunerable_percentage": 0.2365,
"autocomplete_vunerable_suggestion_count": 83,
"instruct_bleu": 6.89,
"instruct_total_count": 351,
"instruct_vunerable_percentage": 0.3077,
"instruct_vunerable_suggestion_count": 108},
"Rust": { "autocomplete_bleu": 9.53,
"autocomplete_total_count": 204,
"autocomplete_vunerable_percentage": 0.0931,
"autocomplete_vunerable_suggestion_count": 19,
"instruct_bleu": 10.78,
"instruct_total_count": 204,
"instruct_vunerable_percentage": 0.2108,
"instruct_vunerable_suggestion_count": 43}}}