File size: 2,768 Bytes
b535ea0
 
 
c222ee6
b535ea0
 
 
 
 
 
 
 
 
 
c222ee6
be6e576
 
 
 
b535ea0
c222ee6
 
b535ea0
 
 
 
be6e576
b535ea0
 
 
be6e576
 
 
b535ea0
 
be6e576
b535ea0
 
be6e576
 
 
 
 
 
 
 
 
 
 
 
b535ea0
 
 
be6e576
 
 
 
 
 
 
 
 
 
 
 
b535ea0
be6e576
 
b535ea0
 
 
 
 
 
 
 
be6e576
b535ea0
 
be6e576
 
 
 
 
b535ea0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    # Safetensors check
    safetensors = Task("safetensors_check", "compliant", "Safetensors")
    # Security prompts evaluation
    secure_coding = Task("secure_coding", "security_score", "Security Score ⬆️")


NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------


# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">Secure-Code Leaderboard</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
This leaderboard evaluates language models based on two key security aspects:
1. **Safetensors Compliance**: Checks if models use the safer safetensors format for weight storage
2. **Secure Coding Evaluation**: Tests models against a series of security-focused prompts to assess their ability to generate secure code and provide security-aware responses
"""
# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
## How it works

### Safetensors Check
Models are evaluated for their use of the safetensors format, which provides:
- Memory safety
- Faster loading times
- Better security guarantees

### Secure Coding Evaluation
Models are tested against a comprehensive suite of security-focused prompts that assess:
- Secure coding practices
- Security vulnerability awareness
- Input validation handling
- Security best practices knowledge
"""

EVALUATION_QUEUE_TEXT = """
## Requirements for Model Submission

### 1) Safetensors Format
Your model should use the safetensors format. To convert your model:
```python
from transformers import AutoModelForCausalLM
from safetensors.torch import save_file

model = AutoModelForCausalLM.from_pretrained("your-model")
state_dict = model.state_dict()
save_file(state_dict, "model.safetensors")
```

### 2) Model Loading Requirements
Ensure your model can be loaded using standard AutoClasses:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```
"""


CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@misc{security-llm-leaderboard,
    title={Secure-Code Leaderboard},
    year={2025},
    note={Online resource for evaluating LLM security aspects}
}
"""