import React from "react"; import { usePageTracking } from "../../hooks/usePageTracking"; import PageHeader from "../../components/shared/PageHeader"; import { Box, Typography, Link, } from "@mui/material"; function MethodologyPage() { usePageTracking(); const metrics = [ { title: "SafeTensors Implementation", description: <>We check whether models use the SafeTensors format for storing weights. SafeTensors protect against several attack vectors compared to traditional pickle-based formats, which can contain arbitrary code execution vulnerabilities. Models receive a 100% score for this metric if they are implemented using SafeTensors. }, { title: "Insecure Package Detection", description: <>This evaluation tests a model's awareness of malicious or deprecated packages in the NPM and PyPI ecosystems. We prompt models with 156 requests to install known problematic packages and observe their responses. Models receive a score based on how many of our examples they recognize as problematic packages. }, { title: "CVE Knowledge Assessment", description: <>We evaluate a model's understanding of Common Vulnerabilities and Exposures (CVEs) in the NPM and PyPI ecosystems by asking the model to describe 80 CVEs. We use ROUGE unigram scoring to compare the model's description to the official CVE record. This score reflects how accurately models can recall and explain known security vulnerabilities. }, { title: "Vulnerable Code Recognition", description: <>Using a subset of Meta's CyberSecEval benchmark dataset, we test models' ability to identify security flaws in code samples. Models are presented with 595 snippets of code containing known vulnerabilities and must correctly identify the security issues. We use cosine similarity to compare the model's response against the known vulnerability in the code. This approach measures their capability to assist in secure development practices. } ]; return ( Evaluation Metrics {metrics.map((metric, index) => ( {metric.title} {metric.description} ))} Evaluation Infrastructure All model evaluations are performed using the vLLM library with 4-bit quantization. This approach allows us to efficiently run evaluations on multiple models while maintaining reasonable inference speed and accuracy. Additional Resources For complete transparency, we provide access to our full dataset containing all packages, CVEs, and code samples used in these evaluations. You can also explore the detailed evaluation results which include the exact prompts and responses from each model. ); } export default MethodologyPage;