import streamlit as st
import pandas as pd

st.set_page_config(page_title="Cyber Benchmark Hub: Leaderboard", layout="wide")

st.title("Cyber Benchmark Hub: Leaderboard")


with st.sidebar:
    st.image("https://cdn.prod.website-files.com/630f558f2a15ca1e88a2f774/631f1436ad7a0605fecc5e15_Logo.svg", use_container_width=True)
    st.markdown("[Priam.ai](https://www.priam.ai/)")
    st.divider()
    
    dataset_categories = ["Multiple Choice"]
    selected_category = st.selectbox("Select Dataset Category", dataset_categories, index=0)
    
    datasets_by_category = {
        "Multiple Choice": ["secQA","CyberMetric80"],
    }
    dataset_choice = st.selectbox("Select Dataset", datasets_by_category[selected_category], index=0)
    
    st.divider()
    st.header("Filters & Options")
    if dataset_choice == "secQA":
        dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
    else:
        st.markdown("**Note:** Only the CyberMetric80 variant has been evaluated.")
        dataset_version = "v1"
    # The model-type filter options come from the CSV, which is loaded later,
    # so reserve a placeholder here and populate it once the data is available.
    source_filter_placeholder = st.empty()  # placeholder for the source filter, filled after the data is loaded
    
    st.markdown("---")
    st.header("Test Parameters")
    test_params = pd.DataFrame({
        "Value": [0, 1, 0, 1, 0]
    }, index=["Temperature", "n", "Presence Penalty", "Top_p", "Frequency Penalty"])
    st.table(test_params)

# Function to estimate random baseline accuracy for MCQ datasets
def estimate_random_accuracy(questions):
    """
    Estimates the average accuracy when answering questions randomly.
    
    Args:
        questions: List of tuples where each tuple is (question_id, num_choices)
    
    Returns:
        The estimated average accuracy (probability of correct answers)
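
    Example:
        >>> estimate_random_accuracy([(1, 4), (2, 2)])  # mean of 1/4 and 1/2
        0.375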
    """
    if not questions:
        return 0.0
    
    total_probability = 0.0
    for question_id, num_choices in questions:
        probability = 1.0 / num_choices
        total_probability += probability
    
    average_accuracy = total_probability / len(questions)
    return average_accuracy

# Per-question choice counts for the SECQA dataset; each tuple is (question_id, num_choices).
# According to the dataset card, there are 242 questions in total.
total_questions = 242
questionnaire = [(1, 4), (2, 1), (3, 4), (4, 2), (5, 3), (6, 3), (7, 4), (8, 2), (9, 4), (10, 2), (11, 4), (12, 4), (13, 2), (14, 2), (15, 4), (16, 4), (17, 2), (18, 2), (19, 2), (20, 1), (21, 2), (22, 4), (23, 1), (24, 4), (25, 3), (26, 3), (27, 2), (28, 3), (29, 2), (30, 1), (31, 2), (32, 3), (33, 3), (34, 2), (35, 4), (36, 3), (37, 1), (38, 2), (39, 1), (40, 2), (41, 1), (42, 3), (43, 3), (44, 1), (45, 3), (46, 1), (47, 4), (48, 2), (49, 2), (50, 4), (51, 2), (52, 4), (53, 1), (54, 4), (55, 3), (56, 3), (57, 3), (58, 1), (59, 2), (60, 4), (61, 1), (62, 3), (63, 1), (64, 3), (65, 1), (66, 3), (67, 4), (68, 1), (69, 1), (70, 1), (71, 3), (72, 2), (73, 1), (74, 2), (75, 3), (76, 3), (77, 3), (78, 4), (79, 1), (80, 4), (81, 4), (82, 4), (83, 2), (84, 3), (85, 2), (86, 1), (87, 1), (88, 2), (89, 2), (90, 2), (91, 4), (92, 4), (93, 3), (94, 2), (95, 3), (96, 3), (97, 2), (98, 4), (99, 4), (100, 3), (101, 4), (102, 2), (103, 4), (104, 2), (105, 3), (106, 2), (107, 3), (108, 4), (109, 4), (110, 2)]
questionnairev2 = [(1, 4), (2, 4), (3, 2), (4, 3), (5, 2), (6, 4), (7, 3), (8, 2), (9, 3), (10, 2), (11, 1), (12, 2), (13, 3), (14, 2), (15, 4), (16, 2), (17, 2), (18, 4), (19, 4), (20, 3), (21, 4), (22, 3), (23, 3), (24, 3), (25, 1), (26, 1), (27, 2), (28, 2), (29, 2), (30, 2), (31, 2), (32, 4), (33, 3), (34, 3), (35, 3), (36, 3), (37, 4), (38, 3), (39, 3), (40, 4), (41, 1), (42, 2), (43, 3), (44, 2), (45, 1), (46, 1), (47, 2), (48, 4), (49, 2), (50, 1), (51, 3), (52, 1), (53, 4), (54, 4), (55, 2), (56, 3), (57, 2), (58, 2), (59, 1), (60, 3), (61, 3), (62, 1), (63, 2), (64, 2), (65, 3), (66, 4), (67, 3), (68, 3), (69, 1), (70, 1), (71, 3), (72, 1), (73, 2), (74, 4), (75, 4), (76, 1), (77, 4), (78, 4), (79, 3), (80, 1), (81, 2), (82, 2), (83, 3), (84, 2), (85, 1), (86, 2), (87, 4), (88, 2), (89, 2), (90, 4), (91, 3), (92, 2), (93, 1), (94, 2), (95, 3), (96, 1), (97, 1), (98, 4), (99, 1), (100, 1)]
random_accuracy = estimate_random_accuracy(questionnaire)
random_accuracyv2 = estimate_random_accuracy(questionnairev2)


# Determine file path based on dataset choice.
# For now, if dataset_choice is "secQA", we use "Benchmark.csv"
if dataset_choice == "secQA":
    file_path = "Benchmark.csv"  # Ensure this file is uploaded in your Hugging Face Space
elif dataset_choice == "CyberMetric80":
    file_path = "metric.csv"  # Placeholder: update with actual file paths for future datasets

# Function to load and clean CSV data
@st.cache_data
def load_data(file_path):
    df = pd.read_csv(file_path)
    
    # Remove any unnamed columns (caused by trailing commas)
    df = df.loc[:, ~df.columns.str.contains('Unnamed', na=False)]
    
    # Standardize column names
    df.columns = df.columns.str.strip()
    df.rename(columns={
        "model name": "Model",
        "source": "Type",
        "v1 metric": "V1 Accuracy",
        "v2 metric": "V2 Accuracy"
    }, inplace=True)
    
    # Convert percentage strings to floats (e.g., "100%" → 1.0)
    for col in ["V1 Accuracy", "V2 Accuracy"]:
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace("%", "").str.strip()
            df[col] = pd.to_numeric(df[col], errors='coerce') / 100
    
    return df

# Load dataset
df = load_data(file_path)

# Update the source filter with the actual options from the data
source_filter = source_filter_placeholder.multiselect(
    "Select Model Type",
    options=df["Type"].unique().tolist(),
    default=df["Type"].unique().tolist()
)

# Apply filtering based on the sidebar selections
df_filtered = df[df["Type"].isin(source_filter)].copy() if source_filter else df.copy()

# Choose the correct metric version and compute Accuracy
if dataset_choice == "CyberMetric80":
    df_filtered["Accuracy"] = df_filtered["V1 Accuracy"]
else:
    df_filtered["Accuracy"] = df_filtered["V1 Accuracy"] if dataset_version == "v1" else df_filtered["V2 Accuracy"]

df_filtered = df_filtered[["Model", "Type", "Accuracy"]].dropna()  # Drop rows with missing or unparseable accuracy values

# Sort by Accuracy descending
df_filtered = df_filtered.sort_values("Accuracy", ascending=False).reset_index(drop=True)

# Compute dense ranking so that models with equal accuracy share the same rank
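# e.g. accuracies [0.90, 0.90, 0.85] -> dense ranks [1, 1, 2]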
df_filtered['Rank'] = df_filtered['Accuracy'].rank(method='dense', ascending=False).astype(int)
df_filtered = df_filtered[['Rank', 'Model', 'Type', 'Accuracy']]



tab1, tab2 = st.tabs(["Leaderboard", "About"])

with tab1:
    if dataset_choice == "secQA":
        st.markdown("#### [View the SECQA Dataset](https://huggingface.co/datasets/zefang-liu/secqa)")
    elif dataset_choice == "CyberMetric80":
        st.markdown("#### [View the CyberMetric Dataset](https://github.com/cybermetric/CyberMetric)")

    # Use columns to display leaderboard and model details side-by-side
    col1, col2 = st.columns([2, 1])

    with col1:
        st.subheader(f"Leaderboard for {dataset_choice.upper()} Version {dataset_version}")
        st.dataframe(df_filtered.style.hide(axis='index'))

    with col2:
        st.subheader("Model Details")
        selected_model = st.selectbox("Select a Model", df_filtered["Model"].tolist())
        model_details = df_filtered[df_filtered["Model"] == selected_model].iloc[0]
        st.write(f"**Model:** {model_details['Model']}")
        st.write(f"**Type:** {model_details['Type']}")
        st.write(f"**Accuracy:** {model_details['Accuracy']:.2%}")
        st.write(f"**Rank:** {model_details['Rank']}")

        st.divider()
        # Display the random baseline accuracy for the SECQA dataset
        if dataset_choice == "secQA":
            st.markdown("### Random Baseline Accuracy")
            st.markdown("**{:.2%}** (computed with random guessing on SECQAv1)".format(random_accuracy))
            st.markdown("**{:.2%}** (computed with random guessing on SECQAv2)".format(random_accuracyv2))

    # Footer
    st.markdown("---")
    st.info("More dataset benchmarks will be added to this hub in the future.")

with tab2:
    st.title("About the Cyber Benchmark Hub")
    st.markdown("""
    Welcome to the **Cyber Benchmark Hub: Leaderboard**!

    This application benchmarks language models on cybersecurity question-answering tasks, currently the [SECQA dataset](https://huggingface.co/datasets/zefang-liu/secqa) and [CyberMetric](https://github.com/cybermetric/CyberMetric). It provides an interactive interface to explore model accuracy, rank models, and understand how different model types perform on security-centric multiple-choice questions.

    
    ### Leaderboard Features

    - Compare **different models** (e.g., GPT, Claude, Mistral) based on SECQA v1 or v2.
    - Filter by **model type/source** (open-source or closed-source).
    - View **dense rankings** where models with equal accuracy share the same rank.
    - See detailed information for each model, including:
      - Accuracy score
      - Rank
    

    ### Random Baseline Accuracy

    The app computes the **expected accuracy** a model would achieve by guessing randomly on every question: a question with *k* answer choices is answered correctly with probability 1/*k*, and the baseline is the average of these probabilities over all questions.

    This helps contextualize the actual performance of models.
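
    For example, if every question offered four answer options, the random-guess baseline would be 1/4 = 25%.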



    ### Built by

    [Priam.ai](https://www.priam.ai/) 

    *This benchmark hub will continue to expand as more models and datasets are released in the cybersecurity NLP space.*
    """)