import streamlit as st
import pandas as pd
st.set_page_config(page_title="Cyber Benchmark Hub: Leaderboard", layout="wide")
st.title("Cyber Benchmark Hub: Leaderboard")
with st.sidebar:
    st.image("https://cdn.prod.website-files.com/630f558f2a15ca1e88a2f774/631f1436ad7a0605fecc5e15_Logo.svg", use_container_width=True)
    st.markdown("[Priam.ai](https://www.priam.ai/)")
    st.divider()

    dataset_categories = ["Multiple Choice"]
    selected_category = st.selectbox("Select Dataset Category", dataset_categories, index=0)
    datasets_by_category = {
        "Multiple Choice": ["secQA", "CyberMetric80"],
    }
    dataset_choice = st.selectbox("Select Dataset", datasets_by_category[selected_category], index=0)
    st.divider()

    st.header("Filters & Options")
    #dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
    if dataset_choice == "secQA":
        dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
    else:
        st.markdown("**Note:** Only CyberMetric80 has been evaluated")
        dataset_version = "v1"

    # For filtering the leaderboard by model type.
    # Note: the available model types come from the CSV, once loaded.
    # We load the CSV later and then update this filter accordingly.
    source_filter_placeholder = st.empty()  # placeholder for source filter after data is loaded

    st.markdown("---")
    st.header("Test Parameters")
    test_params = pd.DataFrame({
        "Value": [0, 1, 0, 1, 0]
    }, index=["Temperature", "n", "Presence Penalty", "Top_p", "Frequency Penalty"])
    st.table(test_params)
# Function to estimate random baseline accuracy for MCQ datasets
def estimate_random_accuracy(questions):
"""
Estimates the average accuracy when answering questions randomly.
Args:
questions: List of tuples where each tuple is (question_id, num_choices)
Returns:
The estimated average accuracy (probability of correct answers)
"""
if not questions:
return 0.0
total_probability = 0.0
for question_id, num_choices in questions:
probability = 1.0 / num_choices
total_probability += probability
average_accuracy = total_probability / len(questions)
return average_accuracy
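# Worked example of the random-baseline estimate (illustrative only, not part of the app logic):
# estimate_random_accuracy([(1, 4), (2, 2)]) returns (1/4 + 1/2) / 2 = 0.375,
# i.e. the mean of 1/num_choices over the listed questions.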
# Random-baseline inputs for the SECQA dataset.
# Each tuple below is (question_id, num_choices) for one test question.
# According to the dataset card, there are 242 questions in total.
total_questions = 242
questionnaire = [(1, 4), (2, 1), (3, 4), (4, 2), (5, 3), (6, 3), (7, 4), (8, 2), (9, 4), (10, 2), (11, 4), (12, 4), (13, 2), (14, 2), (15, 4), (16, 4), (17, 2), (18, 2), (19, 2), (20, 1), (21, 2), (22, 4), (23, 1), (24, 4), (25, 3), (26, 3), (27, 2), (28, 3), (29, 2), (30, 1), (31, 2), (32, 3), (33, 3), (34, 2), (35, 4), (36, 3), (37, 1), (38, 2), (39, 1), (40, 2), (41, 1), (42, 3), (43, 3), (44, 1), (45, 3), (46, 1), (47, 4), (48, 2), (49, 2), (50, 4), (51, 2), (52, 4), (53, 1), (54, 4), (55, 3), (56, 3), (57, 3), (58, 1), (59, 2), (60, 4), (61, 1), (62, 3), (63, 1), (64, 3), (65, 1), (66, 3), (67, 4), (68, 1), (69, 1), (70, 1), (71, 3), (72, 2), (73, 1), (74, 2), (75, 3), (76, 3), (77, 3), (78, 4), (79, 1), (80, 4), (81, 4), (82, 4), (83, 2), (84, 3), (85, 2), (86, 1), (87, 1), (88, 2), (89, 2), (90, 2), (91, 4), (92, 4), (93, 3), (94, 2), (95, 3), (96, 3), (97, 2), (98, 4), (99, 4), (100, 3), (101, 4), (102, 2), (103, 4), (104, 2), (105, 3), (106, 2), (107, 3), (108, 4), (109, 4), (110, 2)]
questionnairev2 = [(1, 4), (2, 4), (3, 2), (4, 3), (5, 2), (6, 4), (7, 3), (8, 2), (9, 3), (10, 2), (11, 1), (12, 2), (13, 3), (14, 2), (15, 4), (16, 2), (17, 2), (18, 4), (19, 4), (20, 3), (21, 4), (22, 3), (23, 3), (24, 3), (25, 1), (26, 1), (27, 2), (28, 2), (29, 2), (30, 2), (31, 2), (32, 4), (33, 3), (34, 3), (35, 3), (36, 3), (37, 4), (38, 3), (39, 3), (40, 4), (41, 1), (42, 2), (43, 3), (44, 2), (45, 1), (46, 1), (47, 2), (48, 4), (49, 2), (50, 1), (51, 3), (52, 1), (53, 4), (54, 4), (55, 2), (56, 3), (57, 2), (58, 2), (59, 1), (60, 3), (61, 3), (62, 1), (63, 2), (64, 2), (65, 3), (66, 4), (67, 3), (68, 3), (69, 1), (70, 1), (71, 3), (72, 1), (73, 2), (74, 4), (75, 4), (76, 1), (77, 4), (78, 4), (79, 3), (80, 1), (81, 2), (82, 2), (83, 3), (84, 2), (85, 1), (86, 2), (87, 4), (88, 2), (89, 2), (90, 4), (91, 3), (92, 2), (93, 1), (94, 2), (95, 3), (96, 1), (97, 1), (98, 4), (99, 1), (100, 1)]
random_accuracy = estimate_random_accuracy(questionnaire)
random_accuracyv2 = estimate_random_accuracy(questionnairev2)
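# random_accuracy and random_accuracyv2 are the random-guessing baselines
# displayed in the Leaderboard tab for SECQA v1 and v2 respectively.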
# Determine file path based on dataset choice.
# For now, if dataset_choice is "secQA", we use "Benchmark.csv"
if dataset_choice == "secQA":
    file_path = "Benchmark.csv"  # Ensure this file is uploaded in your Hugging Face Space
elif dataset_choice == "CyberMetric80":
    file_path = "metric.csv"  # Placeholder: update with actual file paths for future datasets
# Function to load and clean CSV data
@st.cache_data
def load_data(file_path):
    df = pd.read_csv(file_path)
    # Remove any unnamed columns (caused by trailing commas)
    df = df.loc[:, ~df.columns.str.contains('Unnamed', na=False)]
    # Standardize column names
    df.columns = df.columns.str.strip()
    df.rename(columns={
        "model name": "Model",
        "source": "Type",
        "v1 metric": "V1 Accuracy",
        "v2 metric": "V2 Accuracy"
    }, inplace=True)
    # Convert percentage strings to floats (e.g., "100%" → 1.0)
    for col in ["V1 Accuracy", "V2 Accuracy"]:
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace("%", "").str.strip()
            df[col] = pd.to_numeric(df[col], errors='coerce') / 100
    return df
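# The rename map above assumes raw CSV headers roughly like the following
# (hypothetical sample, not copied from the actual benchmark files):
#   model name,source,v1 metric,v2 metric
#   example-model,open-source,85%,80%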
# Load dataset
df = load_data(file_path)
# Update the source filter with the actual options from the data
source_filter = source_filter_placeholder.multiselect(
    "Select Model Type",
    options=df["Type"].unique().tolist(),
    default=df["Type"].unique().tolist()
)
# Apply filtering based on the sidebar selections.
# .copy() avoids a pandas SettingWithCopyWarning when the Accuracy column is added below.
df_filtered = df[df["Type"].isin(source_filter)].copy() if source_filter else df.copy()
# Choose the correct metric version and compute Accuracy
#df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
if dataset_choice == "CyberMetric80":
    df_filtered["Accuracy"] = df_filtered["V1 Accuracy"]
else:
    df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]
df_filtered = df_filtered[["Model", "Type", "Accuracy"]].dropna() # Drop rows with errors
# Sort by Accuracy descending
df_filtered = df_filtered.sort_values("Accuracy", ascending=False).reset_index(drop=True)
# Compute dense ranking so that models with equal accuracy share the same rank
df_filtered['Rank'] = df_filtered['Accuracy'].rank(method='dense', ascending=False).astype(int)
df_filtered = df_filtered[['Rank', 'Model', 'Type', 'Accuracy']]
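# Dense ranking example (illustrative): accuracies [0.95, 0.95, 0.90]
# map to ranks [1, 1, 2]; tied models share a rank and no rank number is skipped.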
tab1, tab2 = st.tabs(["Leaderboard", "About"])
with tab1:
    if dataset_choice == "secQA":
        st.markdown("#### [View the SECQA Dataset](https://huggingface.co/datasets/zefang-liu/secqa)")
    elif dataset_choice == "CyberMetric80":
        st.markdown("#### [View the CyberMetric Dataset](https://github.com/cybermetric/CyberMetric)")

    # Use columns to display the leaderboard and model details side by side
    col1, col2 = st.columns([2, 1])
    with col1:
        st.subheader(f"Leaderboard for {dataset_choice.upper()} Version {dataset_version}")
        st.dataframe(df_filtered.style.hide(axis='index'))
    with col2:
        st.subheader("Model Details")
        selected_model = st.selectbox("Select a Model", df_filtered["Model"].tolist())
        model_details = df_filtered[df_filtered["Model"] == selected_model].iloc[0]
        st.write(f"**Model:** {model_details['Model']}")
        st.write(f"**Type:** {model_details['Type']}")
        st.write(f"**Accuracy:** {model_details['Accuracy']:.2%}")
        st.write(f"**Rank:** {model_details['Rank']}")

    st.divider()
    # Display the random baseline accuracy below the leaderboard
    if dataset_choice == "secQA":
        st.markdown("### Random Baseline Accuracy")
        st.markdown("**{:.2%}** (computed with random guessing on SECQA v1)".format(random_accuracy))
        st.markdown("**{:.2%}** (computed with random guessing on SECQA v2)".format(random_accuracyv2))

    # Footer
    st.markdown("---")
    st.info("More dataset benchmarks will be added to this hub in the future.")
with tab2:
st.title("About the Cyber Benchmark Hub")
st.markdown("""
Welcome to the **Cyber Benchmark Hub: Leaderboard**!
This application benchmarks language models on their performance across cybersecurity question-answering tasks using the [SECQA dataset](https://huggingface.co/datasets/zefang-liu/secqa). It provides an interactive interface to explore model accuracy, rank models, and understand how different model types perform on security-centric multiple-choice questions.
### Leaderboard Features
- Compare **different models** (e.g., GPT, Claude, Mistral) based on SECQA v1 or v2.
- Filter by **model type/source** (open-source, closed)
- View **dense rankings** where models with equal accuracy share the same rank.
- See detailed information for each model, including:
- Accuracy score
- Rank
### Random Baseline Accuracy
The app computes the **expected accuracy** if a model guessed randomly on all questions:
This helps contextualize the actual performance of models.
### Built by
[Priam.ai](https://www.priam.ai/)
*This benchmark hub will continue to expand as more models and datasets are released in the cybersecurity NLP space.*
""")