Spaces:
Running
Running
| import React, { useState, useEffect } from 'react'; | |
| const BenchmarkChart = () => { | |
| // Real data sorted by Human Baseline ASR (highest to lowest) | |
| const benchmarkData = [ | |
| { | |
| model: "Grok 4", | |
| baseline: 68.67, | |
| methods: { | |
| keyword_objective_combined: 85.15 | |
| } | |
| }, | |
| { | |
| model: "Deepseek R1-0528", | |
| baseline: 68.67, | |
| methods: { | |
| keyword_objective_combined: 83.76 | |
| } | |
| }, | |
| { | |
| model: "Gemini 2.5 Pro", | |
| baseline: 55.67, | |
| methods: { | |
| keyword_objective_combined: 74.14, | |
| root_problem: 67.19 | |
| } | |
| }, | |
| { | |
| model: "Mixtral 8x22B", | |
| baseline: 48.00, | |
| methods: { | |
| keyword_objective_combined: 66.82 | |
| } | |
| }, | |
| { | |
| model: "Llama 4 Maverick Instruct", | |
| baseline: 45.00, | |
| methods: { | |
| keyword_objective_combined: 56.46 | |
| } | |
| }, | |
| { | |
| model: "Gemini 3", | |
| baseline: 32.50, | |
| methods: { | |
| bio_topic_change: 53.68 | |
| } | |
| }, | |
| { | |
| model: "Claude 4 Sonnet", | |
| baseline: 26.33, | |
| methods: { | |
| keyword_objective_combined: 28.64 | |
| } | |
| }, | |
| { | |
| model: "GPT o3", | |
| baseline: 22.00, | |
| methods: { | |
| keyword_objective_combined: 30.53 | |
| } | |
| }, | |
| { | |
| model: "Claude Opus 4.1", | |
| baseline: 20.67, | |
| methods: { | |
| keyword_objective_combined: 23.56 | |
| } | |
| }, | |
| { | |
| model: "GPT 5", | |
| baseline: 8.33, | |
| methods: { | |
| keyword_objective_combined: 11.68, | |
| root_problem: 12.46 | |
| } | |
| } | |
| ]; | |
| const [currentPhase, setCurrentPhase] = useState('baseline'); | |
| const getCurrentValue = (modelData, phase) => { | |
| if (phase === 'baseline') { | |
| return modelData.baseline; | |
| } else if (phase === 'transformation') { | |
| // Return the maximum value from all available transformation methods | |
| let maxASR = modelData.baseline; | |
| Object.values(modelData.methods).forEach(value => { | |
| maxASR = Math.max(maxASR, value); | |
| }); | |
| return maxASR; | |
| } | |
| return 0; | |
| }; | |
| const handleToggle = () => { | |
| setCurrentPhase(currentPhase === 'baseline' ? 'transformation' : 'baseline'); | |
| }; | |
| return ( | |
| <div className="min-h-screen bg-gradient-to-br from-slate-900 to-slate-800 p-4"> | |
| <div className="max-w-6xl mx-auto"> | |
| {/* Header */} | |
| <div className="text-center mb-6"> | |
| <h1 className="text-3xl font-bold text-white mb-3"> | |
| JulyAI Safety Benchmark For Frontier Models | |
| </h1> | |
| <p className="text-slate-300"> | |
| Attack Success Rate Analysis with Transformation Methods | |
| </p> | |
| {/* Control Button */} | |
| <div className="flex justify-center"> | |
| <button | |
| onClick={handleToggle} | |
| className="px-6 py-2 bg-blue-600 hover:bg-blue-700 text-white rounded-lg font-semibold transition-colors" | |
| > | |
| {currentPhase === 'baseline' ? 'Show Transformation Impact' : 'Reset to Baseline'} | |
| </button> | |
| </div> | |
| </div> | |
| {/* Chart Container - Longer */} | |
| <div className="bg-white rounded-xl shadow-2xl p-4"> | |
| {/* Legend - At Top of Chart */} | |
| <div className="mb-6 flex justify-center space-x-6 text-sm bg-gray-50 p-3 rounded-lg"> | |
| <div className="flex items-center space-x-2"> | |
| <div className="w-3 h-3 bg-gradient-to-r from-blue-500 to-blue-600 rounded"></div> | |
| <span className="text-gray-700">Human Baseline</span> | |
| </div> | |
| <div className="flex items-center space-x-2"> | |
| <div className="w-3 h-3 bg-gradient-to-r from-green-500 to-green-600 rounded"></div> | |
| <span className="text-gray-700">Transformation Extension</span> | |
| </div> | |
| </div> | |
| <div className="h-[600px] overflow-y-auto pr-2"> | |
| <div className="space-y-2"> | |
| {benchmarkData.map((modelData, index) => { | |
| const currentValue = getCurrentValue(modelData, currentPhase); | |
| const baselineValue = modelData.baseline; | |
| const maxValue = 90; | |
| const baselineWidth = (baselineValue / maxValue) * 100; | |
| const totalWidth = (currentValue / maxValue) * 100; | |
| const extensionWidth = totalWidth - baselineWidth; | |
| const gain = currentValue - baselineValue; | |
| return ( | |
| <div key={modelData.model} className="relative"> | |
| {/* Model Name and Value - Cleaner Layout */} | |
| <div className="flex items-center justify-between mb-1"> | |
| <div> | |
| <h3 className="font-semibold text-gray-800 text-sm"> | |
| {modelData.model} | |
| </h3> | |
| </div> | |
| <div className="text-right flex items-center space-x-2"> | |
| {gain > 0 && ( | |
| <span className="text-xs font-semibold text-green-600"> | |
| +{gain.toFixed(1)} | |
| </span> | |
| )} | |
| <span className="text-lg font-bold text-gray-700"> | |
| {currentValue.toFixed(1)}% | |
| </span> | |
| </div> | |
| </div> | |
| {/* Progress Bar - Shows Growth from Baseline */} | |
| <div className="relative h-6 bg-gray-200 rounded-full overflow-hidden"> | |
| {/* Baseline Bar (Blue) - Flat, no rounding for seamless extension */} | |
| <div | |
| className="absolute left-0 top-0 h-full bg-gradient-to-r from-blue-500 to-blue-600" | |
| style={{ width: `${Math.max(baselineWidth, 5)}%` }} | |
| /> | |
| {/* Extension Bar (Green) - Only rounded at the end */} | |
| <div | |
| className="absolute top-0 h-full bg-gradient-to-r from-green-500 to-green-600 rounded-r-full transition-all duration-1000 ease-out" | |
| style={{ | |
| left: `${baselineWidth}%`, | |
| width: currentPhase === 'transformation' ? `${extensionWidth}%` : '0%' | |
| }} | |
| /> | |
| </div> | |
| </div> | |
| ); | |
| })} | |
| </div> | |
| </div> | |
| </div> | |
| {/* Methodology Note - Moved Below Chart */} | |
| <div className="mt-6 p-3 bg-yellow-900/30 border border-yellow-500/30 rounded-lg max-w-4xl mx-auto"> | |
| <div className="flex items-start space-x-3"> | |
| <div className="text-yellow-400 mt-1">⚠️</div> | |
| <div className="text-left"> | |
| <p className="text-yellow-200 font-semibold mb-2">Methodology Note</p> | |
| <p className="text-yellow-100 text-sm leading-relaxed"> | |
| <strong>Additive Visualization:</strong> This chart shows cumulative impact by progressively adding each transformation method's individual attack success rate. | |
| Values >100% represent transformation of multiple conversations off one failed, human seed conversation. | |
| Results are based on HarmBench Grading methodology and should be interpreted as relative performance indicators. | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| {/* Footer */} | |
| <div className="mt-4 text-center text-slate-400 space-y-1"> | |
| <p className="text-sm"> | |
| Sorted by Human Baseline ASR (highest to lowest) • Click button above for manual control | |
| </p> | |
| <p className="text-xs"> | |
| Bars extend from baseline to show transformation method impact | |
| </p> | |
| </div> | |
| </div> | |
| </div> | |
| ); | |
| }; | |
| export default BenchmarkChart; |