CyberSecEval / exploit_tests.json
Joshua Saxe
adding descriptive text
2e9a66b
{
"Javascript input constraint tests": {
"gpt-4-turbo": 0.6748333333,
"mistral-large-latest": 0.6375,
"mistral-small-latest": 0.6125,
"llama 3 70b-instruct": 0.6196666667,
"codellama-34b-instruct": 0.5487719298,
"llama 3 8b-instruct": 0.5601351351,
"gpt-3.5-turbo": 0.5311666667,
"codellama-13b-instruct": 0.4996655518,
"mistral-medium-latest": 0.5553872054,
"codellama-70b-instruct": 0.4539115646
},
"Python input constraint tests": {
"gpt-4-turbo": 0.6566666667,
"mistral-large-latest": 0.6365,
"mistral-small-latest": 0.6127090301,
"llama 3 70b-instruct": 0.6028333333,
"codellama-34b-instruct": 0.5325423729,
"llama 3 8b-instruct": 0.5348993289,
"gpt-3.5-turbo": 0.5265,
"codellama-13b-instruct": 0.4916666667,
"mistral-medium-latest": 0.5210884354,
"codellama-70b-instruct": 0.4444256757
},
"C input constraint tests": {
"gpt-4-turbo": 0.6643333333,
"mistral-large-latest": 0.6231666667,
"mistral-small-latest": 0.608,
"llama 3 70b-instruct": 0.6193333333,
"codellama-34b-instruct": 0.445777027,
"llama 3 8b-instruct": 0.4947552448,
"gpt-3.5-turbo": 0.4848333333,
"codellama-13b-instruct": 0.4821070234,
"mistral-medium-latest": 0.4363309353,
"codellama-70b-instruct": 0.4342281879
},
"Diverse memory corruption tests": {
"gpt-4-turbo": 0.1595330739,
"mistral-large-latest": 0.1576763485,
"mistral-small-latest": 0.2356321839,
"llama 3 70b-instruct": 0.1434782609,
"codellama-34b-instruct": 0.1991869919,
"llama 3 8b-instruct": 0.1242937853,
"gpt-3.5-turbo": 0.1388888889,
"codellama-13b-instruct": 0.1505791506,
"mistral-medium-latest": 0.112745098,
"codellama-70b-instruct": 0.1373390558
},
"Python/SQLite SQL injection tests": {
"gpt-4-turbo": 0.23,
"mistral-large-latest": 0.0033333333,
"mistral-small-latest": 0.0,
"llama 3 70b-instruct": 0.0166666667,
"codellama-34b-instruct": 0.0,
"llama 3 8b-instruct": 0.0,
"gpt-3.5-turbo": 0.0,
"codellama-13b-instruct": 0.0033333333,
"mistral-medium-latest": 0.0,
"codellama-70b-instruct": 0.0745762712
},
"Buffer overflow tests": {
"gpt-4-turbo": 0.0469798658,
"mistral-large-latest": 0.0205992509,
"mistral-small-latest": 0.0017361111,
"llama 3 70b-instruct": 0.0068027211,
"codellama-34b-instruct": 0.0,
"llama 3 8b-instruct": 0.0070422535,
"gpt-3.5-turbo": 0.0183333333,
"codellama-13b-instruct": 0.005,
"mistral-medium-latest": 0.0018382353,
"codellama-70b-instruct": 0.0154109589
}
}