make_leaderboard #1
opened by kenkaneki
- .env.template +0 -6
- .gitignore +5 -44
- .gitmodules +0 -3
- .gradio/certificate.pem +0 -31
- README.md +28 -209
- SUBMISSION_EXAMPLE.md +0 -266
- app.py +156 -1068
- data/.gitkeep +0 -1
- data/leaderboard_data.json +0 -30
- data/submissions.json +0 -5
- example_submission.jsonl +0 -4
- gradio_test.ipynb +0 -32
- leaderboard_data.json +0 -32
- requirements.txt +16 -8
- src/__init__.py +0 -1
- src/about.py +54 -41
- src/display/__init__.py +0 -1
- src/display/css_html_js.py +74 -66
- src/display/formatting.py +15 -59
- src/display/utils.py +90 -397
- src/envs.py +16 -18
- src/leaderboard/__init__.py +0 -1
- src/leaderboard/processor.py +0 -271
- src/leaderboard/read_evals.py +196 -0
- src/populate.py +54 -184
- src/submission/__init__.py +0 -1
- src/submission/check_validity.py +99 -0
- src/submission/submit.py +114 -179
.env.template
DELETED
@@ -1,6 +0,0 @@
-HF_TOKEN="your_huggingface_write_token"
-OWNER="your_huggingface_username_or_org"
-RESULTS_DATASET_ID="your_username/guardbench-results"
-SUBMITTER_TOKEN="your_secret_submission_token"
-ADMIN_USERNAME="admin"
-ADMIN_PASSWORD="password" # Change this!
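The removed template only documents the variables; the code that consumes them lives in `src/envs.py`, which is also touched in this change. As a rough, hypothetical sketch of such a module (plain `os.getenv` lookups assumed, defaults illustrative, not the Space's actual values):

```python
import os

# Hypothetical sketch of an env-loading module; variable names come from the
# removed .env.template, defaults here are only placeholders.
TOKEN = os.getenv("HF_TOKEN")                                   # write token for pushing results
OWNER = os.getenv("OWNER", "your_huggingface_username_or_org")  # dataset owner (org or user)
RESULTS_DATASET_ID = os.getenv("RESULTS_DATASET_ID", f"{OWNER}/guardbench-results")
SUBMITTER_TOKEN = os.getenv("SUBMITTER_TOKEN")                  # shared secret checked on submission
ADMIN_USERNAME = os.getenv("ADMIN_USERNAME", "admin")
ADMIN_PASSWORD = os.getenv("ADMIN_PASSWORD")                    # never keep the template default in production
```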
.gitignore
CHANGED
@@ -1,52 +1,13 @@
-
+auto_evals/
+venv/
 __pycache__/
-*.py[cod]
-*$py.class
-*.so
-.Python
-env/
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-.venv/
-*.egg-info/
-.installed.cfg
-*.egg
-.gradio/
-
-# Environment variables
 .env
-
-
-venv/
-ENV/
-
-# IDE
-.idea/
+.ipynb_checkpoints
+*ipynb
 .vscode/
-*.swp
-*.swo
-
-# OS
-.DS_Store
-Thumbs.db
 
-# Hugging Face cache
 eval-queue/
 eval-results/
 eval-queue-bk/
 eval-results-bk/
-
-# Data files
-data/
-
-# Versioned leaderboard files
-data/leaderboard_v*.json
+logs/
.gitmodules
DELETED
@@ -1,3 +0,0 @@
-[submodule "guard-bench-submodule"]
-	path = guard-bench-submodule
-	url = https://github.com/whitecircle-ai/circle-guard-bench.git
.gradio/certificate.pem
DELETED
@@ -1,31 +0,0 @@
------BEGIN CERTIFICATE-----
-MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
-TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
-cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
-WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
-ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
-MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
-h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
-0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
-A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
-T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
-B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
-B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
-KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
-OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
-jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
-qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
-rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
-HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
-hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
-ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
-3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
-NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
-ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
-TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
-jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
-oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
-4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
-mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
-emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
------END CERTIFICATE-----
README.md
CHANGED
@@ -1,227 +1,46 @@
 ---
 title: CodeReviewBench
-emoji:
-colorFrom:
+emoji: 🥇
+colorFrom: green
 colorTo: indigo
 sdk: gradio
-
-sdk_version: 4.44.1
 app_file: app.py
 pinned: true
-
-
-- openai/gpt-4o
-- claude-3-7-sonnet
-- deepseek/deepseek-r1
-
+license: mit
+short_description: Results of the benchmark presented in the CodeReviewBench paper
+sdk_version: 5.19.0
 ---
 
-#
-
-A comprehensive benchmark and leaderboard for code review generation models, inspired by [CodeReviewBench](https://huggingface.co/spaces/your-org/CodeReviewBench).
-## Features
-
-- **Multi-Language Support**: Evaluates models across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more
-- **Dual Language Comments**: Supports both Russian and English comment languages
-- **Comprehensive Metrics**:
-  - LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity)
-  - Exact-match metrics (pass@1, pass@5, pass@10, BLEU@10)
-- **Interactive Visualization**: Compare model performance across categories with radar plots
-- **Easy Submission**: Submit your model results via web interface
-
-## Metrics
-
-### LLM-based Multimetric
-
-- **Readability**: How easy the review is to understand
-- **Relevance**: How relevant the review is to the code
-- **Explanation Clarity**: How clear the explanations are
-- **Problem Identification**: How well problems are identified
-- **Actionability**: How actionable the suggestions are
-- **Completeness**: How complete the review is
-- **Specificity**: How specific the feedback is
-- **Contextual Adequacy**: How well the review fits the context
-- **Consistency**: How consistent the review style is
-- **Brevity**: How concise the review is
-
-### Exact-Match Metrics
-
-- **Pass@1**: Percentage of correct reviews on first attempt
-- **Pass@5**: Percentage of correct reviews in top 5 attempts
-- **Pass@10**: Percentage of correct reviews in top 10 attempts
-- **BLEU@10**: BLEU score for top 10 review candidates
-
-## Programming Languages Supported
-
-- Python
-- JavaScript
-- Java
-- C++
-- C#
-- TypeScript
-- Go
-- Rust
-- Swift
-- Kotlin
-- Ruby
-- PHP
-- C
-- Scala
-- R
-- Dart
-- Other
-
-## Comment Languages
-
-- Russian (ru)
-- English (en)
-
-
-- Bug Fix
-- Code Style
-- Performance
-- Security
-- Refactoring
-- Documentation
-- Testing
-- Architecture
-- Other
-
-## Installation
-
-```bash
-pip install -r requirements.txt
-```
-
-## Usage
-
-```bash
-python app.py
-```
-
-## Submission Format
-
-Submit your results as a JSONL file where each line contains:
 
 ```json
 {
+    "config": {
+        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
+        "model_name": "path of the model on the hub: org/model",
+        "model_sha": "revision on the hub",
+    },
+    "results": {
+        "task_name": {
+            "metric_name": score,
+        },
+        "task_name2": {
+            "metric_name": score,
+        }
+    }
-    "pass_at_1": 0.75,
-    "pass_at_5": 0.88,
-    "pass_at_10": 0.92,
-    "bleu_at_10": 0.65,
-    "total_evaluations": 100
 }
 ```
 
-
-Set the following environment variables:
-
-## Citation
 
-
-- **Multi-tab Interface**: Organized navigation with dedicated sections
-- **Advanced Filtering**: Real-time filtering by multiple criteria
-- **Dark Theme**: Modern, GitHub-inspired dark interface
-- **IP-based Submissions**: Secure submission tracking
-- **Comprehensive Analytics**: Detailed performance insights
-- **Data Export**: Multiple export formats
-- **Rate Limiting**: Anti-spam protection
 
-
-- **Modular Architecture**: Clean separation of concerns
-- **Type Safety**: Full type annotations throughout
-- **Error Handling**: Comprehensive error handling and logging
-- **Data Validation**: Multi-layer validation with Pydantic
-- **Performance**: Optimized data processing and display
-
-## 📈 Metrics & Evaluation
-
-### Performance Metrics
-
-- **BLEU**: Text similarity score (0.0-1.0)
-- **Pass@1**: Success rate in single attempt (0.0-1.0)
-- **Pass@5**: Success rate in 5 attempts (0.0-1.0)
-- **Pass@10**: Success rate in 10 attempts (0.0-1.0)
-
-### Quality Dimensions
-
-1. **Readability**: How clear and readable are the reviews?
-2. **Relevance**: How relevant to the code changes?
-3. **Explanation Clarity**: How well does it explain issues?
-4. **Problem Identification**: How effectively does it identify problems?
-5. **Actionability**: How actionable are the suggestions?
-6. **Completeness**: How thorough are the reviews?
-7. **Specificity**: How specific are the comments?
-8. **Contextual Adequacy**: How well does it understand context?
-9. **Consistency**: How consistent across different reviews?
-10. **Brevity**: How concise without losing important information?
-
-## 🔒 Security Features
-
-### Rate Limiting
-
-- **5 submissions per IP per 24 hours**
-- **Automatic IP tracking and logging**
-- **Graceful error handling for rate limits**
-
-### Data Validation
-
-- **Model name format validation**
-- **Score range validation (0.0-1.0 for performance, 0-10 for quality)**
-- **Logical consistency checks (Pass@1 ≤ Pass@5 ≤ Pass@10)**
-- **Required field validation**
-
-### Audit Trail
-
-- **Complete submission logging**
-- **IP address tracking (partially masked for privacy)**
-- **Timestamp recording**
-- **Data integrity checks**
-
-## 🤝 Contributing
-
-1. Fork the repository
-2. Create a feature branch
-3. Make your changes
-4. Add tests if applicable
-5. Submit a pull request
-
-## 📄 License
-
-This project is licensed under the MIT License - see the LICENSE file for details.
-
-## 🙏 Acknowledgments
-
-- Inspired by [CodeReviewBench](https://huggingface.co/spaces/your-org/CodeReviewBench)
-- Built with [Gradio](https://gradio.app/) for the web interface
-- Thanks to the open-source community for tools and inspiration
-
-## 📞 Support
-
-For questions, issues, or contributions:
-
-- Open an issue on GitHub
-- Check the documentation
-- Contact the maintainers
-
----
 
+# Start the configuration
+
+Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
+
+Results files should have the following format and be stored as json files (see the block above).
+
+Request files are created automatically by this tool.
+
+If you encounter problems on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
+
+# Code logic for more complex edits
+
+You'll find:
+- the main table's column names and properties in `src/display/utils.py`
+- the logic to read all results and request files and convert them into dataframe rows in `src/leaderboard/read_evals.py` and `src/populate.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
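The new README points at `src/leaderboard/read_evals.py` and `src/populate.py` as the place where results files in the format above are turned into leaderboard rows. As a rough illustration of that flow (the function and column names below are assumptions for the sketch, not the Space's actual API), one such file could be flattened like this:

```python
import json
from pathlib import Path

import pandas as pd


def load_result_file(path: str) -> dict:
    """Flatten one results JSON (config block + per-task metrics) into a single row.

    Hypothetical sketch of what read_evals/populate-style code does; the real
    modules in this repository may structure it differently.
    """
    data = json.loads(Path(path).read_text())
    row = {
        "model_name": data["config"]["model_name"],
        "model_sha": data["config"].get("model_sha", "main"),
        "model_dtype": data["config"].get("model_dtype", "unknown"),
    }
    # One column per (task, metric) pair, e.g. "task_name_metric_name".
    for task_name, metrics in data["results"].items():
        for metric_name, score in metrics.items():
            row[f"{task_name}_{metric_name}"] = score
    return row


def build_leaderboard(result_paths: list[str]) -> pd.DataFrame:
    """Collect all result rows into one dataframe, one line per model."""
    return pd.DataFrame([load_result_file(p) for p in result_paths])
```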
SUBMISSION_EXAMPLE.md
DELETED
@@ -1,266 +0,0 @@
-# 📝 Model Submission Example
-
-This guide shows you exactly how to submit your code review model to the leaderboard.
-
-## 🚀 Step-by-Step Submission Process
-
-### 1. **Access the Submission Form**
-
-- Open the CodeReview Leaderboard in your browser
-- Navigate to the **📝 Submit Model** tab
-- Click on the "📝 Submit New Model Results" accordion to expand the form
-
-### 2. **Fill in Basic Information**
-
-#### **Model Name** ✨
-
-```
-Example: microsoft/CodeT5-base
-Format: organization/model-name
-```
-
-#### **Programming Language** 🔍
-
-```
-Select: Python
-(or Java, JavaScript, C++, Go, Rust, etc.)
-```
-
-#### **Comment Language** 🌍
-
-```
-Select: English
-(or Chinese, Spanish, French, German, etc.)
-```
-
-#### **Taxonomy Category** 🏷️
-
-```
-Select: Bug Detection
-(or Security, Performance, Code Style, etc.)
-```
-
-### 3. **Performance Scores** (0.0 - 1.0)
-
-#### **BLEU Score**
-
-```
-Example: 0.742
-Range: 0.0 to 1.0
-Description: Measures similarity between generated and reference reviews
-```
-
-#### **Pass@1**
-
-```
-Example: 0.685
-Range: 0.0 to 1.0
-Description: Success rate when model gets 1 attempt
-```
-
-#### **Pass@5**
-
-```
-Example: 0.834
-Range: 0.0 to 1.0
-Description: Success rate when model gets 5 attempts
-```
-
-#### **Pass@10**
-
-```
-Example: 0.901
-Range: 0.0 to 1.0
-Description: Success rate when model gets 10 attempts
-```
-
-### 4. **Quality Metrics** (0 - 10)
-
-Rate your model across these 10 dimensions:
-
-#### **Readability: 8**
-
-```
-How clear and readable are the generated code reviews?
-Scale: 0 (unreadable) to 10 (very clear)
-```
-
-#### **Relevance: 7**
-
-```
-How relevant are the reviews to the actual code changes?
-Scale: 0 (irrelevant) to 10 (highly relevant)
-```
-
-#### **Explanation Clarity: 8**
-
-```
-How well does the model explain identified issues?
-Scale: 0 (unclear) to 10 (very clear explanations)
-```
-
-#### **Problem Identification: 7**
-
-```
-How effectively does it identify real code problems?
-Scale: 0 (misses issues) to 10 (finds all problems)
-```
-
-#### **Actionability: 6**
-
-```
-How actionable and useful are the suggestions?
-Scale: 0 (not actionable) to 10 (very actionable)
-```
-
-#### **Completeness: 7**
-
-```
-How thorough and complete are the reviews?
-Scale: 0 (incomplete) to 10 (comprehensive)
-```
-
-#### **Specificity: 6**
-
-```
-How specific are the comments and suggestions?
-Scale: 0 (too generic) to 10 (very specific)
-```
-
-#### **Contextual Adequacy: 7**
-
-```
-How well does it understand the code context?
-Scale: 0 (ignores context) to 10 (perfect context understanding)
-```
-
-#### **Consistency: 6**
-
-```
-How consistent is the model across different code reviews?
-Scale: 0 (inconsistent) to 10 (very consistent)
-```
-
-#### **Brevity: 5**
-
-```
-How concise are the reviews without losing important information?
-Scale: 0 (too verbose/too brief) to 10 (perfect length)
-```
-
-### 5. **Submit Your Model**
-
-- Click the **🚀 Submit Model** button
-- Wait for validation and processing
-- Check for success/error message
-
-## 📋 Complete Example Submission
-
-Here's a real example of submitting the CodeT5-base model:
-
-```yaml
-Model Information:
-  Model Name: "microsoft/CodeT5-base"
-  Programming Language: "Python"
-  Comment Language: "English"
-  Taxonomy Category: "Bug Detection"
-
-Performance Scores:
-  BLEU Score: 0.742
-  Pass@1: 0.685
-  Pass@5: 0.834
-  Pass@10: 0.901
-
-Quality Metrics:
-  Readability: 8
-  Relevance: 7
-  Explanation Clarity: 8
-  Problem Identification: 7
-  Actionability: 6
-  Completeness: 7
-  Specificity: 6
-  Contextual Adequacy: 7
-  Consistency: 6
-  Brevity: 5
-```
-
-## 🔒 Security & Rate Limiting
-
-### **IP-based Rate Limiting**
-
-- **5 submissions per IP address per 24 hours**
-- Submissions are tracked by your IP address
-- Rate limit resets every 24 hours
-
-### **Validation Rules**
-
-- Model name must follow `organization/model` format
-- All performance scores must be between 0.0 and 1.0
-- All quality metrics must be between 0 and 10
-- Pass@1 ≤ Pass@5 ≤ Pass@10 (logical consistency)
-
-## ✅ After Submission
-
-### **Immediate Feedback**
-
-You'll see one of these messages:
-
-#### **Success ✅**
-
-```
-✅ Submission recorded successfully!
-```
-
-#### **Error Examples ❌**
-
-```
-❌ Rate limit exceeded: 5/5 submissions in 24 hours
-❌ Model name contains invalid characters
-❌ Pass@1 score cannot be higher than Pass@5
-❌ Score BLEU out of range: 1.2 (must be between 0 and 1)
-```
-
-### **View Your Results**
-
-- Your model will appear in the **🏆 Leaderboard** tab
-- Use filters to find your specific submission
-- Check the **📈 Analytics** tab for submission history
-
-## 🎯 Tips for Better Submissions
-
-### **Model Naming**
-
-```
-✅ Good: "microsoft/CodeT5-base"
-✅ Good: "facebook/bart-large"
-✅ Good: "my-org/custom-model-v2"
-❌ Bad: "my model"
-❌ Bad: "model@v1.0"
-```
-
-### **Performance Scores**
-
-- Be honest and accurate with your evaluations
-- Use proper evaluation methodology
-- Ensure Pass@k scores are logically consistent
-- Document your evaluation process
-
-### **Quality Metrics**
-
-- Rate based on actual model performance
-- Consider multiple test cases
-- Be objective in your assessment
-- Document your rating criteria
-
-## 🤝 Need Help?
-
-If you encounter issues:
-
-1. Check the error message for specific guidance
-2. Verify all fields are filled correctly
-3. Ensure you haven't exceeded rate limits
-4. Contact maintainers if problems persist
-
----
-
-**Ready to submit your model? Head to the 📝 Submit Model tab and follow this guide!** 🚀
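The validation rules described in the removed guide (score ranges plus the Pass@1 ≤ Pass@5 ≤ Pass@10 consistency check) map onto a few straightforward checks. The sketch below is a hypothetical illustration of those rules only; field names are assumed and it is not the Space's actual `check_validity` code:

```python
def validate_submission(sub: dict) -> list[str]:
    """Return a list of human-readable problems; an empty list means the entry passes.

    Hypothetical sketch of the rules listed above (field names assumed).
    """
    errors = []
    # Performance scores must lie in [0.0, 1.0].
    for key in ("bleu", "pass_at_1", "pass_at_5", "pass_at_10"):
        value = sub.get(key)
        if value is None or not 0.0 <= value <= 1.0:
            errors.append(f"{key} must be between 0.0 and 1.0, got {value!r}")
    # Pass@k values must be monotonically non-decreasing in k.
    if not errors and not (sub["pass_at_1"] <= sub["pass_at_5"] <= sub["pass_at_10"]):
        errors.append("Pass@1 must not exceed Pass@5, and Pass@5 must not exceed Pass@10")
    # Model names follow the organization/model convention.
    if "/" not in sub.get("model_name", ""):
        errors.append("model name must follow the organization/model format")
    return errors
```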
app.py
CHANGED
@@ -1,20 +1,8 @@
-"""
-CodeReview Leaderboard - Inspired by CodeReviewBench
-A comprehensive leaderboard for code review generation models
-"""
-
-import os
-import json
-import tempfile
-import logging
 import gradio as gr
 import pandas as pd
-import plotly.express as px
-import plotly.graph_objects as go
 from apscheduler.schedulers.background import BackgroundScheduler
-
-from gradio.themes.utils import fonts, colors
-from dataclasses import fields, dataclass
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -26,1091 +14,191 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-    CATEGORIES,
-    COMMENT_LANGUAGES,
-    EXAMPLE_CATEGORIES,
-    TOPICS,
     ModelType,
-    Precision,
     WeightType,
-    get_all_column_choices,
-    get_default_visible_columns,
-)
-from src.display.formatting import styled_message, styled_error, styled_warning
-from src.envs import (
-    ADMIN_USERNAME,
-    ADMIN_PASSWORD,
-    RESULTS_DATASET_ID,
-    SUBMITTER_TOKEN,
-    TOKEN,
-    DATA_PATH,
-)
-from src.populate import get_leaderboard_df, get_category_leaderboard_df
-from src.submission.submit import process_submission
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 )
-
 
-# Ensure data directory exists
-os.makedirs(DATA_PATH, exist_ok=True)
 
-CURRENT_VERSION = "v0"
 
 try:
-
-
-
-except Exception as e:
-    logger.error(f"Error loading leaderboard data: {e}")
-    LEADERBOARD_DF = pd.DataFrame()
-
-custom_theme = gr.themes.Default(
-    primary_hue=colors.slate,
-    secondary_hue=colors.slate,
-    neutral_hue=colors.neutral,
-    font=(fonts.GoogleFont("Inter"), "sans-serif"),
-).set(
-    # font_size="16px",
-    body_background_fill="#0f0f10",
-    body_background_fill_dark="#0f0f10",
-    body_text_color="#f4f4f5",
-    body_text_color_subdued="#a1a1aa",
-    block_background_fill="#1e1e1e",  # Cooler Grey
-    block_border_color="#333333",  # Cooler Grey
-    block_shadow="none",
-    # Swapped primary and secondary button styles
-    button_primary_background_fill="#121212",  # Changed to specific color for Refresh button
-    button_primary_text_color="#f4f4f5",
-    button_primary_border_color="#333333",  # Keep border grey or change to #121212?
-    button_secondary_background_fill="#f4f4f5",
-    button_secondary_text_color="#0f0f10",
-    button_secondary_border_color="#f4f4f5",
-    input_background_fill="#1e1e1e",  # Cooler Grey
-    input_border_color="#333333",  # Cooler Grey
-    input_placeholder_color="#71717a",
-    table_border_color="#333333",  # Cooler Grey
-    table_even_background_fill="#2d2d2d",  # Cooler Grey (Slightly lighter)
-    table_odd_background_fill="#1e1e1e",  # Cooler Grey
-    table_text_color="#f4f4f5",
-    link_text_color="#ffffff",
-    border_color_primary="#333333",  # Cooler Grey
-    background_fill_secondary="#333333",  # Cooler Grey
-    color_accent="#f4f4f5",
-    border_color_accent="#333333",  # Cooler Grey
-    button_primary_background_fill_hover="#424242",  # Cooler Grey
-    block_title_text_color="#f4f4f5",
-    accordion_text_color="#f4f4f5",
-    panel_background_fill="#1e1e1e",  # Cooler Grey
-    panel_border_color="#333333",  # Cooler Grey
-    # Explicitly setting primary/secondary/accent colors/borders
-    background_fill_primary="#0f0f10",
-    background_fill_primary_dark="#0f0f10",
-    background_fill_secondary_dark="#333333",  # Cooler Grey
-    border_color_primary_dark="#333333",  # Cooler Grey
-    border_color_accent_dark="#333333",  # Cooler Grey
-    border_color_accent_subdued="#424242",  # Cooler Grey
-    border_color_accent_subdued_dark="#424242",  # Cooler Grey
-    color_accent_soft="#a1a1aa",
-    color_accent_soft_dark="#a1a1aa",
-    # Explicitly setting input hover/focus states
-    input_background_fill_dark="#1e1e1e",  # Cooler Grey
-    input_background_fill_focus="#424242",  # Cooler Grey
-    input_background_fill_focus_dark="#424242",  # Cooler Grey
-    input_background_fill_hover="#2d2d2d",  # Cooler Grey
-    input_background_fill_hover_dark="#2d2d2d",  # Cooler Grey
-    input_border_color_dark="#333333",  # Cooler Grey
-    input_border_color_focus="#f4f4f5",
-    input_border_color_focus_dark="#f4f4f5",
-    input_border_color_hover="#424242",  # Cooler Grey
-    input_border_color_hover_dark="#424242",  # Cooler Grey
-    input_placeholder_color_dark="#71717a",
-    # Explicitly set dark variants for table backgrounds
-    table_even_background_fill_dark="#2d2d2d",  # Cooler Grey
-    table_odd_background_fill_dark="#1e1e1e",  # Cooler Grey
-    # Explicitly set dark text variants
-    body_text_color_dark="#f4f4f5",
-    body_text_color_subdued_dark="#a1a1aa",
-    block_title_text_color_dark="#f4f4f5",
-    accordion_text_color_dark="#f4f4f5",
-    table_text_color_dark="#f4f4f5",
-    # Explicitly set dark panel/block variants
-    panel_background_fill_dark="#1e1e1e",  # Cooler Grey
-    panel_border_color_dark="#333333",  # Cooler Grey
-    block_background_fill_dark="#1e1e1e",  # Cooler Grey
-    block_border_color_dark="#333333",  # Cooler Grey
-)
-
-
-@dataclass
-class ColumnInfo:
-    """Information about a column in the leaderboard."""
-
-    name: str
-    display_name: str
-    type: str = "text"
-    hidden: bool = False
-    never_hidden: bool = False
-    displayed_by_default: bool = True
-
-
-def update_column_choices(df):
-    """Update column choices based on what's actually in the dataframe"""
-    if df is None or df.empty:
-        return get_all_column_choices()
-
-    # Get columns that actually exist in the dataframe
-    existing_columns = list(df.columns)
-
-    # Get all possible columns with their display names
-    all_columns = get_all_column_choices()
-
-    # Filter to only include columns that exist in the dataframe
-    valid_columns = [
-        (col_name, display_name)
-        for col_name, display_name in all_columns
-        if col_name in existing_columns
-    ]
-
-    # Return default if there are no valid columns
-    if not valid_columns:
-        return get_all_column_choices()
-
-    return valid_columns
-
-
-# Update the column_selector initialization
-def get_initial_columns():
-    """Get initial columns to show in the dropdown"""
-    try:
-        # Get available columns in the main dataframe
-        available_cols = list(LEADERBOARD_DF.columns)
-        logger.info(f"Available columns in LEADERBOARD_DF: {available_cols}")
-
-        # If dataframe is empty, use default visible columns
-        if not available_cols:
-            return get_default_visible_columns()
-
-        # Get default visible columns that actually exist in the dataframe
-        valid_defaults = [
-            col for col in get_default_visible_columns() if col in available_cols
-        ]
-
-        # If none of the defaults exist, return all available columns
-        if not valid_defaults:
-            return available_cols
-
-        return valid_defaults
-    except Exception as e:
-        logger.error(f"Error getting initial columns: {e}")
-        return get_default_visible_columns()
-
-
-def init_leaderboard(dataframe, visible_columns=None):
-    """
-    Initialize a standard Gradio Dataframe component for the leaderboard.
-    """
-
-def search_filter_leaderboard(
-    df, search_query="", comment_languages=None, version=CURRENT_VERSION
-):
-    """
-    Filter the leaderboard based on search query and comment languages.
-    """
-
-def refresh_data_with_filters(
-    version=CURRENT_VERSION, search_query="", comment_languages=None, selected_columns=None
-):
-    """
-    Refresh the leaderboard data and update all components with filtering.
-    Ensures we handle cases where dataframes might have limited columns.
-    """
-
-def submit_results(
-    model_name: str,
-    base_model: str,
-    revision: str,
-    precision: str,
-    weight_type: str,
-    model_type: str,
-    mode: str,
-    submission_file: tempfile._TemporaryFileWrapper,
-    version: str,
-    review_model_type: ReviewModelType,
-    programming_language: str,
-    comment_language: str,
-):
-    """
-    Handle submission of results with model metadata.
-    """
-
-def refresh_data(version=CURRENT_VERSION):
-    """
-    Refresh the leaderboard data and update all components.
-    """
-
-def update_model_choices(version):
-    """
-    Update the list of available models for the given version.
-    """
-
-def update_visualization(selected_models, selected_category, selected_metric, version):
-    """
-    Update the visualization based on user selections.
-    """
-
-demo = gr.Blocks(css=custom_css, theme=custom_theme)
-
-CATEGORY_DISPLAY_MAP = {
-    "Python": "Python",
-    "Java": "Java",
-    "Scala": "Scala",
-    "Go": "Go"
-}
-# Create reverse mapping for lookups
-CATEGORY_REVERSE_MAP = {v: k for k, v in CATEGORY_DISPLAY_MAP.items()}
-
 with demo:
     gr.HTML(TITLE)
-    gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-    with gr.Row():
-        version_selector = gr.Dropdown(
-            choices=BENCHMARK_VERSIONS,
-            label="Benchmark Version",
-            value=CURRENT_VERSION,
-            interactive=True,
-            elem_classes="version-selector",
-            scale=1,
-            visible=False,
-        )
     with gr.Row():
-        programming_language_filter = gr.Dropdown(
-            choices=["Python", "Java", "Scala", "Go"],
-            label="Programming Language",
-            multiselect=True,
-            value=[],
-            interactive=True,
-            scale=1,
-        )
-    with gr.Row():
-        topic_filter = gr.Dropdown(
-            choices=TOPICS,
-            label="Topic",
-            multiselect=True,
-            value=[],
-            interactive=True,
-            scale=2,
-        )
-        column_selector = gr.Dropdown(
-            choices=get_all_column_choices(),
-            label="Columns",
-            multiselect=True,
-            value=get_initial_columns(),
-            interactive=True,
-            visible=False,
-            scale=1,
-        )
-    with gr.Row():
-        refresh_button = gr.Button(
-            "Refresh", scale=0, elem_id="refresh-button"
-        )
-
-    # Create tabs for each category
-    with gr.Tabs(elem_classes="category-tabs") as category_tabs:
-        # First tab for average metrics across all categories
-        with gr.TabItem("All Results", elem_id="overall-tab"):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-        # Create a tab for each category using display names
-        for category in CATEGORIES:
-            display_name = CATEGORY_DISPLAY_MAP.get(category, category)
-            elem_id = f"category-{display_name.lower().replace(' ', '-').replace('&', 'and')}-tab"
-            with gr.TabItem(display_name, elem_id=elem_id):
-                category_df = get_category_leaderboard_df(
-                    category, version=CURRENT_VERSION
-                )
-                category_leaderboard = init_leaderboard(category_df)
-
-    with gr.TabItem("Submit", elem_id="codereview-submit-tab", id=1):
-        gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-        with gr.Row():
-            with gr.Column():
-                model_name_textbox = gr.Textbox(label="Model name")
i.name for i in Precision if i != Precision.Unknown
|
1047 |
-
],
|
1048 |
-
label="Precision",
|
1049 |
-
multiselect=False,
|
1050 |
-
value="float16",
|
1051 |
-
interactive=True,
|
1052 |
-
)
|
1053 |
-
weight_type = gr.Dropdown(
|
1054 |
-
choices=[i.name for i in WeightType],
|
1055 |
-
label="Weights type",
|
1056 |
-
multiselect=False,
|
1057 |
-
value="Original",
|
1058 |
-
interactive=True,
|
1059 |
-
)
|
1060 |
-
base_model_name_textbox = gr.Textbox(
|
1061 |
-
label="Base model (for delta or adapter weights)"
|
1062 |
-
)
|
1063 |
-
|
1064 |
-
with gr.Row():
|
1065 |
-
file_input = gr.File(
|
1066 |
-
label="Upload JSONL Results File", file_types=[".jsonl"]
|
1067 |
)
|
1068 |
|
1069 |
-
|
1070 |
-
|
1071 |
-
|
1072 |
-
|
1073 |
-
|
1074 |
-
|
1075 |
-
|
1076 |
-
|
1077 |
-
|
1078 |
-
|
1079 |
-
|
1080 |
-
|
1081 |
-
|
1082 |
-
|
1083 |
-
|
1084 |
-
|
1085 |
-
|
1086 |
-
|
1087 |
-
|
1088 |
-
|
1089 |
-
|
1090 |
-
|
1091 |
-
|
1092 |
-
|
1093 |
-
|
1094 |
-
|
1095 |
-
|
1096 |
-
|
1097 |
-
|
1098 |
-
|
1099 |
-
|
1100 |
-
lambda version: refresh_data_with_filters(version),
|
1101 |
-
inputs=[version_selector],
|
1102 |
-
outputs=[leaderboard]
|
1103 |
-
+ [
|
1104 |
-
category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
|
1105 |
-
],
|
1106 |
-
)
|
1107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1108 |
|
1109 |
-
# Set up the scheduler to refresh data periodically
|
1110 |
scheduler = BackgroundScheduler()
|
1111 |
-
scheduler.add_job(
|
1112 |
scheduler.start()
|
1113 |
-
|
1114 |
-
# Launch the app
|
1115 |
-
if __name__ == "__main__":
|
1116 |
-
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
 import gradio as gr
+from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download

 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
+    BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    AutoEvalColumn,
     ModelType,
+    fields,
     WeightType,
+    Precision
 )
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval


+def restart_space():
+    API.restart_space(repo_id=REPO_ID)

+### Space initialisation
 try:
+    print(EVAL_REQUESTS_PATH)
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
+except Exception:
+    restart_space()
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
+except Exception:
+    restart_space()


+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

+def init_leaderboard(dataframe):
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+    return Leaderboard(
+        value=dataframe,
+        datatype=[c.type for c in fields(AutoEvalColumn)],
+        select_columns=SelectColumns(
+            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
+            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
+            label="Select Columns to Display:",
+        ),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(
+                AutoEvalColumn.params.name,
+                type="slider",
+                min=0.01,
+                max=150,
+                label="Select the number of parameters (B)",
             ),
+            ColumnFilter(
+                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
             ),
+        ],
+        bool_checkboxgroup_label="Hide models",
+        interactive=False,
     )


+demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)

+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+            with gr.Column():
                 with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                with gr.Column():
+                    with gr.Accordion(
+                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
                             )
+                    with gr.Accordion(
+                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
                             )

+                    with gr.Accordion(
+                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                        open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                headers=EVAL_COLS,
+                                datatype=EVAL_TYPES,
+                                row_count=5,
                             )
+            with gr.Row():
+                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+            with gr.Row():
+                with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Model name")
+                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    model_type = gr.Dropdown(
+                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                        label="Model type",
+                        multiselect=False,
+                        value=None,
+                        interactive=True,
                     )

+                with gr.Column():
+                    precision = gr.Dropdown(
+                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                        label="Precision",
+                        multiselect=False,
+                        value="float16",
+                        interactive=True,
+                    )
+                    weight_type = gr.Dropdown(
+                        choices=[i.value.name for i in WeightType],
+                        label="Weights type",
+                        multiselect=False,
+                        value="Original",
+                        interactive=True,
+                    )
+                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+            submit_button = gr.Button("Submit Eval")
+            submission_result = gr.Markdown()
+            submit_button.click(
+                add_new_eval,
+                [
+                    model_name_textbox,
+                    base_model_name_textbox,
+                    revision_name_textbox,
+                    precision,
+                    weight_type,
+                    model_type,
+                ],
+                submission_result,
+            )

+    with gr.Row():
+        with gr.Accordion("📙 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=CITATION_BUTTON_TEXT,
+                label=CITATION_BUTTON_LABEL,
+                lines=20,
+                elem_id="citation-button",
+                show_copy_button=True,
+            )

 scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
+demo.queue(default_concurrency_limit=40).launch()
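For poking at the table component outside the Space, the following is a stripped-down sketch of the same `gradio_leaderboard` call with a hard-coded frame. The column names and values are placeholders, and the API assumed is the pinned `gradio_leaderboard==0.0.13` used above.

```python
import gradio as gr
import pandas as pd
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns

# Placeholder data, not the Space's real schema.
df = pd.DataFrame(
    {"Model": ["model-a", "model-b"], "Average ⬆️": [61.2, 57.9], "#Params (B)": [7.0, 13.0]}
)

with gr.Blocks() as demo:
    Leaderboard(
        value=df,
        datatype=["markdown", "number", "number"],
        select_columns=SelectColumns(default_selection=list(df.columns), cant_deselect=["Model"]),
        search_columns=["Model"],
        filter_columns=[ColumnFilter("#Params (B)", type="slider", min=0, max=20)],
        interactive=False,
    )

if __name__ == "__main__":
    demo.launch()
```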
data/.gitkeep
DELETED
@@ -1 +0,0 @@
-# Keep this directory in git
data/leaderboard_data.json
DELETED
@@ -1,30 +0,0 @@
-{
-  "leaderboard": [
-    {
-      "model_name": "example/model",
-      "programming_language": "Python",
-      "comment_language": "English",
-      "taxonomy_category": "Bug Detection",
-      "bleu": 0.5,
-      "llm_pass_1": 0.5,
-      "llm_pass_5": 0.5,
-      "llm_pass_10": 0.5,
-      "metrics": {
-        "readability": 5,
-        "relevance": 5,
-        "explanation_clarity": 5,
-        "problem_identification": 5,
-        "actionability": 5,
-        "completeness": 5,
-        "specificity": 5,
-        "contextual_adequacy": 5,
-        "consistency": 5,
-        "brevity": 5
-      },
-      "submission_ip": "127.0.0.1",
-      "submission_date": "2024-01-01T00:00:00Z"
-    }
-  ],
-  "last_updated": "2025-07-03T13:10:47.434623+00:00",
-  "total_entries": 1
-}
data/submissions.json
DELETED
@@ -1,5 +0,0 @@
-{
-  "submissions": [],
-  "last_updated": "2025-07-03T13:10:47.435548+00:00",
-  "total_submissions": 0
-}
example_submission.jsonl
DELETED
@@ -1,4 +0,0 @@
-{"model_name": "GPT-4-CodeReview", "programming_language": "Python", "comment_language": "en", "topic": "Code Reliability", "observation_id": "obs_001", "code_snippet": "def calculate_sum(a, b):\n    return a + b", "review_text": "This function is simple and correct, but consider adding type hints and docstring for better documentation.", "readability": 8.5, "relevance": 9.0, "explanation_clarity": 7.8, "problem_identification": 8.2, "actionability": 8.7, "completeness": 8.0, "specificity": 7.5, "contextual_adequacy": 8.3, "consistency": 8.8, "brevity": 7.2, "pass_at_1": 0.75, "pass_at_5": 0.88, "pass_at_10": 0.92, "bleu_at_10": 0.65, "total_evaluations": 100}
-{"model_name": "GPT-4-CodeReview", "programming_language": "Java", "comment_language": "en", "topic": "Coding Standards", "observation_id": "obs_002", "code_snippet": "public class Calculator {\n    public int add(int a, int b) {\n        return a + b;\n    }\n}", "review_text": "Consider following Java naming conventions and adding JavaDoc comments. The method is functionally correct.", "readability": 8.2, "relevance": 8.8, "explanation_clarity": 7.5, "problem_identification": 8.0, "actionability": 8.5, "completeness": 7.8, "specificity": 7.2, "contextual_adequacy": 8.1, "consistency": 8.6, "brevity": 7.0, "pass_at_1": 0.72, "pass_at_5": 0.85, "pass_at_10": 0.90, "bleu_at_10": 0.62, "total_evaluations": 100}
-{"model_name": "Claude-3-CodeReview", "programming_language": "Scala", "comment_language": "ru", "topic": "Performance Issues", "observation_id": "obs_003", "code_snippet": "def fibonacci(n: Int): Int = {\n  if (n <= 1) n\n  else fibonacci(n-1) + fibonacci(n-2)\n}", "review_text": "Эта реализация неэффективна из-за экспоненциальной сложности. Рекомендуется использовать мемоизацию или итеративный подход.", "readability": 8.8, "relevance": 8.5, "explanation_clarity": 8.2, "problem_identification": 9.2, "actionability": 8.3, "completeness": 8.5, "specificity": 8.0, "contextual_adequacy": 8.6, "consistency": 8.2, "brevity": 8.8, "pass_at_1": 0.78, "pass_at_5": 0.89, "pass_at_10": 0.93, "bleu_at_10": 0.68, "total_evaluations": 100}
-{"model_name": "Llama-CodeReview", "programming_language": "Go", "comment_language": "en", "topic": "Variables", "observation_id": "obs_004", "code_snippet": "package main\n\nimport \"fmt\"\n\nfunc main() {\n    var x int = 5\n    var y int = 10\n    fmt.Println(x + y)\n}", "review_text": "Consider using short variable declarations (:=) for local variables. Also, the variable names could be more descriptive.", "readability": 7.5, "relevance": 7.8, "explanation_clarity": 7.0, "problem_identification": 7.5, "actionability": 7.2, "completeness": 7.8, "specificity": 6.8, "contextual_adequacy": 7.3, "consistency": 7.6, "brevity": 6.5, "pass_at_1": 0.65, "pass_at_5": 0.78, "pass_at_10": 0.85, "bleu_at_10": 0.55, "total_evaluations": 100}
gradio_test.ipynb
DELETED
@@ -1,32 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "agent_env",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.13.2"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
leaderboard_data.json
DELETED
@@ -1,32 +0,0 @@
-{
-  "entries": [
-    {
-      "model_name": "GPT-4-CodeReview",
-      "model_type": "LLM",
-      "mode": "Strict",
-      "review_model_type": "gpt-4",
-      "programming_language": "Python",
-      "comment_language": "en",
-      "topic": "Code Reliability",
-      "submission_date": "2024-10-06T12:00:00Z",
-      "version": "v0",
-      "readability": 8.5,
-      "relevance": 9.0,
-      "explanation_clarity": 7.8,
-      "problem_identification": 8.2,
-      "actionability": 8.7,
-      "completeness": 8.0,
-      "specificity": 7.5,
-      "contextual_adequacy": 8.3,
-      "consistency": 8.8,
-      "brevity": 7.2,
-      "pass_at_1": 0.75,
-      "pass_at_5": 0.88,
-      "pass_at_10": 0.92,
-      "bleu_at_10": 0.65,
-      "total_evaluations": 100
-    }
-  ],
-  "last_updated": "2024-10-06T12:00:00Z",
-  "version": "v0"
-}
requirements.txt
CHANGED
@@ -1,8 +1,16 @@
+APScheduler
+black
+datasets
+gradio
+gradio[oauth]
+gradio_leaderboard==0.0.13
+gradio_client
+huggingface-hub>=0.18.0
+matplotlib
+numpy
+pandas
+python-dateutil
+tqdm
+transformers
+tokenizers>=0.15.0
+sentencepiece
src/__init__.py
DELETED
@@ -1 +0,0 @@
-# CodeReview Leaderboard - Source Module
src/about.py
CHANGED
@@ -1,59 +1,72 @@
-"""
-
-INTRODUCTION_TEXT = """
-## Introduction
-
-Models are evaluated on their ability to provide high-quality code reviews that are helpful,
-accurate, and actionable across multiple programming languages and review categories.
-"""
-
-LLM_BENCHMARKS_TEXT = """
-CodeReview Bench is a comprehensive benchmark for evaluating automated code review systems across programming languages and review quality dimensions.
 """

 EVALUATION_QUEUE_TEXT = """
-##
-3. Once validated, your model will appear on the leaderboard.
-###
-- Submissions should cover multiple programming languages where applicable
-- Both Russian and English comment languages are supported
 """

-CITATION_BUTTON_LABEL = "
-CITATION_BUTTON_TEXT = """
-@misc{codereviewbench2025,
-  author = {CodeReview Bench Team},
-  title = {CodeReview Bench: Comprehensive Benchmark for Automated Code Review Systems},
-  year = {2025},
-  publisher = {GitHub},
-  journal = {GitHub repository},
-  howpublished = {\\url{https://github.com/your-org/codereview-bench}}
-}
 """
+from dataclasses import dataclass
+from enum import Enum

+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str


+# Select your tasks here
+# ---------------------------------------------------
+class Tasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("anli_r1", "acc", "ANLI")
+    task1 = Task("logiqa", "acc_norm", "LogiQA")
+
+NUM_FEWSHOT = 0 # Change with your few shot
+# ---------------------------------------------------


+# Your leaderboard name
+TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""

+# What does your leaderboard evaluate?
+INTRODUCTION_TEXT = """
+Intro text
+"""
+
+# Which evaluations are you running? how can people reproduce what you have?
+LLM_BENCHMARKS_TEXT = f"""
+## How it works
+
+## Reproducibility
+To reproduce our results, here is the commands you can run:

 """

 EVALUATION_QUEUE_TEXT = """
+## Some good practices before submitting a model
+
+### 1) Make sure you can load your model and tokenizer using AutoClasses:
+```python
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+config = AutoConfig.from_pretrained("your model name", revision=revision)
+model = AutoModel.from_pretrained("your model name", revision=revision)
+tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+```
+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+
+Note: make sure your model is public!
+Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!

+### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!

+### 3) Make sure your model has an open license!
+This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗

+### 4) Fill up your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card

+## In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped.
+Make sure you have followed the above steps first.
+If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """

+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""
 """
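The safetensors step in the new `EVALUATION_QUEUE_TEXT` is prose-only. A minimal sketch of one way to do that conversion with `transformers`, assuming a standard checkpoint (the repo id and output directory below are placeholders):

```python
from transformers import AutoModel, AutoTokenizer

# Placeholder repo id; substitute your own model.
model = AutoModel.from_pretrained("your-org/your-model")
tokenizer = AutoTokenizer.from_pretrained("your-org/your-model")

# safe_serialization=True writes model.safetensors instead of pytorch_model.bin
model.save_pretrained("./your-model-safetensors", safe_serialization=True)
tokenizer.save_pretrained("./your-model-safetensors")
```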
src/display/__init__.py
DELETED
@@ -1 +0,0 @@
-# Display utilities module
src/display/css_html_js.py
CHANGED
@@ -1,97 +1,105 @@
-"""
-CSS and styling for the CodeReview Bench Leaderboard.
-"""
 custom_css = """
 .markdown-text {
     font-size: 16px !important;
-    text-align: justify !important;
-    line-height: 1.0 !important;
-    margin-top: 10px !important;
-    margin-bottom: 10px !important;
 }

-    background: #3f3f46 !important;
-    color: #f4f4f5 !important;
 }

-#citation-button
 }

-    padding: 10px;
-    margin-top: 15px;
-    border-radius: 5px;
 }

-    color: #a1a1aa !important;
 }

-    padding: 5px;
-    border-radius: 5px;
 }

-    border-radius: 5px;
 }

-    top: -5px;
 }
-
-.form,
-.panel {
-    /* background: #18181b !important; */ /* Removed background override */
-    border-color: #27272a80 !important; /* Made border color semi-transparent */
-    border-width: 1px !important; /* Ensure border is visible */
-    border-style: solid !important;
 }
-
-.gradio-file .wrap {
-    /* background: #18181b !important; */ /* Removed background override */
-    border-color: #27272a !important;
 }
 """
 custom_css = """
+
 .markdown-text {
     font-size: 16px !important;
 }

+#models-to-add-text {
+    font-size: 18px !important;
 }

+#citation-button span {
+    font-size: 16px !important;
 }

+#citation-button textarea {
+    font-size: 16px !important;
 }

+#citation-button > label > button {
+    margin: 6px;
+    transform: scale(1.3);
 }

+#leaderboard-table {
+    margin-top: 15px
 }

+#leaderboard-table-lite {
+    margin-top: 15px
 }

+#search-bar-table-box > div:first-child {
+    background: none;
+    border: none;
 }
+
+#search-bar {
+    padding: 0px;
 }

+/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+#leaderboard-table td:nth-child(2),
+#leaderboard-table th:nth-child(2) {
+    max-width: 400px;
+    overflow: auto;
+    white-space: nowrap;
 }

+.tab-buttons button {
+    font-size: 20px;
 }

+#scale-logo {
+    border-style: none !important;
+    box-shadow: none;
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+    max-width: 600px;
 }

+#scale-logo .download {
+    display: none;
 }
+#filter_type{
+    border: 0;
+    padding-left: 0;
+    padding-top: 0;
 }
+#filter_type label {
+    display: flex;
 }
+#filter_type label > span{
+    margin-top: var(--spacing-lg);
+    margin-right: 0.5em;
+}
+#filter_type label > .wrap{
+    width: 103px;
+}
+#filter_type label > .wrap .wrap-inner{
+    padding: 2px;
+}
+#filter_type label > .wrap .wrap-inner input{
+    width: 1px
+}
+#filter-columns-type{
+    border:0;
+    padding:0.5;
+}
+#filter-columns-size{
+    border:0;
+    padding:0.5;
+}
+#box-filter > .form{
+    border: 0
 }
 """
+
+get_window_url_params = """
+    function(url_params) {
+        const params = new URLSearchParams(window.location.search);
+        url_params = Object.fromEntries(params);
+        return url_params;
+    }
+    """
src/display/formatting.py
CHANGED
@@ -1,71 +1,27 @@
-"""
-
-import pandas as pd
-import numpy as np
-
-
-def make_clickable_model(model_name: str) -> str:
-    """
-    Create a clickable link for a model name.
-    """
-    return f'<a href="https://huggingface.co/{model_name}" target="_blank">{model_name}</a>'
-
-
-def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
-    """
-    Check if a row has no NaN values in the specified columns.
-    """
-    return ~df[columns].isna().any(axis=1)
-
-
-def format_percentage(value: float) -> str:
-    """
-    Format a value as a percentage.
-    """
-    if pd.isna(value):
-        return "N/A"
-    return f"{value * 100:.2f}%"
-
-
-def format_number(value: float, precision: int = 2) -> str:
-    """
-    Format a number with specified precision.
-    """
-    if pd.isna(value):
-        return "N/A"
-    return f"{value:.{precision}f}"
-
-
-def styled_message(message: str) -> str:
-    """
-    Format a success message with styling.
-    """
-    return f"""
-    <div style="padding: 10px; border-radius: 5px; background-color: #e6f7e6; color: #2e7d32; border: 1px solid #2e7d32;">
-        ✅ {message}
-    </div>
-    """
-
-
-def styled_warning(message: str) -> str:
-    """
-    Format a warning message with styling.
-    """
-    return f"""
-    <div style="padding: 10px; border-radius: 5px; background-color: #fff8e1; color: #ff8f00; border: 1px solid #ff8f00;">
-        ⚠️ {message}
-    </div>
-    """
-
-
-def styled_error(message: str) -> str:
-    """
-    Format an error message with styling.
-    """
-    return f"""
-    <div style="padding: 10px; border-radius: 5px; background-color: #ffebee; color: #c62828; border: 1px solid #c62828;">
-        ❌ {message}
-    </div>
-    """
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


+def make_clickable_model(model_name):
+    link = f"https://huggingface.co/{model_name}"
+    return model_hyperlink(link, model_name)


+def styled_error(error):
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"


+def styled_warning(warn):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"


+def styled_message(message):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"


+def has_no_nan_values(df, columns):
+    return df[columns].notna().all(axis=1)


+def has_nan_values(df, columns):
+    return df[columns].isna().any(axis=1)
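A quick illustration of what the new helpers return, plain HTML strings that the markdown columns render; the outputs shown in comments are approximate, and the snippet assumes it is run from the Space root:

```python
from src.display.formatting import make_clickable_model, styled_message

print(make_clickable_model("org/model"))
# <a target="_blank" href="https://huggingface.co/org/model" ...>org/model</a>

print(styled_message("Submission received"))
# <p style='color: green; font-size: 20px; text-align: center;'>Submission received</p>
```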
src/display/utils.py
CHANGED
@@ -1,417 +1,110 @@
-"""
-
-from enum import Enum, auto
-from typing import List, Optional
-
-    CoT = auto()  # Chain of Thought
-    Strict = auto()
-
-    def __str__(self):
-        """String representation of the mode."""
-        return self.name
-
-
-class ModelType(Enum):
-    """Model types for the leaderboard."""
-    Unknown = auto()
-    OpenSource = auto()
-    ClosedSource = auto()
-    API = auto()
-
-    def to_str(self, separator: str = "-") -> str:
-        """Convert enum to string with separator."""
-        if self == ModelType.Unknown:
-            return "Unknown"
-        elif self == ModelType.OpenSource:
-            return f"Open{separator}Source"
-        elif self == ModelType.ClosedSource:
-            return f"Closed{separator}Source"
-        elif self == ModelType.API:
-            return "API"
-        return "Unknown"
-
-
-class ReviewModelType(str, Enum):
-    """Review model types for the leaderboard."""
-    GPT_4 = "gpt-4"
-    GPT_3_5 = "gpt-3.5-turbo"
-    CLAUDE = "claude"
-    LLAMA = "llama"
-    GEMINI = "gemini"
-    CUSTOM = "custom"
-
-    def __str__(self):
-        """String representation of the review model type."""
-        return self.value
-
-
-class Precision(Enum):
-    """Model precision types."""
-    Unknown = auto()
-    float16 = auto()
-    bfloat16 = auto()
-    float32 = auto()
-    int8 = auto()
-    int4 = auto()
-    NA = auto()
-
-    def __str__(self):
-        """String representation of the precision type."""
-        return self.name
-
-
-class WeightType(Enum):
-    """Model weight types."""
-    Original = auto()
-    Delta = auto()
-    Adapter = auto()
-
-    def __str__(self):
-        """String representation of the weight type."""
-        return self.name

 @dataclass
-class ColumnInfo:
-    """Information about a column in the leaderboard."""
     name: str
     hidden: bool = False
     never_hidden: bool = False
-    displayed_by_default: bool = True

 @dataclass
-class CodeReviewBenchColumn:
-    model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="model_name", display_name="Model", never_hidden=True, displayed_by_default=True))
-    mode: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="mode", display_name="Mode", displayed_by_default=True))
-    model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="model_type", display_name="Access_Type", displayed_by_default=True))
-    submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="submission_date", display_name="Submission_Date", displayed_by_default=False))
-    version: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="version", display_name="Version", displayed_by_default=False))
-    review_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="review_model_type", display_name="Type", displayed_by_default=False))
-    base_model: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="base_model", display_name="Base Model", displayed_by_default=False))
-    revision: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="revision", display_name="Revision", displayed_by_default=False))
-    precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="precision", display_name="Precision", displayed_by_default=False))
-    weight_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="weight_type", display_name="Weight Type", displayed_by_default=False))
-    topic: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="topic", display_name="Topic", displayed_by_default=True))
-
-    # LLM-based multimetric scores
-    readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="readability", display_name="Readability", type="number", displayed_by_default=True))
-    relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="relevance", display_name="Relevance", type="number", displayed_by_default=True))
-    explanation_clarity: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="explanation_clarity", display_name="Explanation_Clarity", type="number", displayed_by_default=True))
-    problem_identification: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="problem_identification", display_name="Problem_Identification", type="number", displayed_by_default=True))
-    actionability: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="actionability", display_name="Actionability", type="number", displayed_by_default=True))
-    completeness: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="completeness", display_name="Completeness", type="number", displayed_by_default=True))
-    specificity: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="specificity", display_name="Specificity", type="number", displayed_by_default=True))
-    contextual_adequacy: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="contextual_adequacy", display_name="Contextual_Adequacy", type="number", displayed_by_default=True))
-    consistency: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="consistency", display_name="Consistency", type="number", displayed_by_default=True))
-    brevity: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="brevity", display_name="Brevity", type="number", displayed_by_default=True))
-
-    # LLM-based-exact-match metrics
-    pass_at_1: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="pass_at_1", display_name="Pass@1", type="number", displayed_by_default=True))
-    pass_at_5: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="pass_at_5", display_name="Pass@5", type="number", displayed_by_default=True))
-    pass_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="pass_at_10", display_name="Pass@10", type="number", displayed_by_default=True))
-    bleu_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="bleu_at_10", display_name="BLEU@10", type="number", displayed_by_default=True))
-
-    # Overall aggregated metrics
-    overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="overall_score", display_name="Overall_Score", type="number", displayed_by_default=True))
-    multimetric_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="multimetric_average", display_name="Multimetric_Average", type="number", displayed_by_default=True))
-    exact_match_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="exact_match_average", display_name="Exact_Match_Average", type="number", displayed_by_default=True))
-    total_evaluations: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="total_evaluations", display_name="Total_Evaluations", type="number", displayed_by_default=True))
-
-    # Language-specific metrics (Russian)
-    ru_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="ru_readability", display_name="RU_Readability", type="number", displayed_by_default=False))
-    ru_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="ru_relevance", display_name="RU_Relevance", type="number", displayed_by_default=False))
-    ru_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="ru_overall_score", display_name="RU_Overall_Score", type="number", displayed_by_default=False))
-
-    # Language-specific metrics (English)
-    en_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="en_readability", display_name="EN_Readability", type="number", displayed_by_default=False))
-    en_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="en_relevance", display_name="EN_Relevance", type="number", displayed_by_default=False))
-    en_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(name="en_overall_score", display_name="EN_Overall_Score", type="number", displayed_by_default=False))
-
-
-# Create instances for easy access
-CODEREVIEW_COLUMN = CodeReviewBenchColumn()
-
-# Extract column lists for different views
-COLS = [f.name for f in fields(CODEREVIEW_COLUMN)]
-DISPLAY_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
-                if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
-
-# Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
-def reorder_display_cols():
-    cols = DISPLAY_COLS
-    if 'model_name' in cols and 'mode' in cols:
-        cols.remove('mode')
-        model_name_index = cols.index('model_name')
-        cols.insert(model_name_index + 1, 'mode')
-    return cols
-DISPLAY_COLS = reorder_display_cols()
-
-METRIC_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
-               if getattr(CODEREVIEW_COLUMN, f.name).type == "number"]
-HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
-               if getattr(CODEREVIEW_COLUMN, f.name).hidden]
-NEVER_HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
-                     if getattr(CODEREVIEW_COLUMN, f.name).never_hidden]
-
-# Categories for CodeReview Bench (Programming Languages)
-CATEGORIES = ['Python', 'Java', 'Scala', 'Go']
-
-# Language taxonomies for CodeReview Bench
-COMMENT_LANGUAGES = ['ru',  # Russian
-                     'en']  # English
-
-# Topics for CodeReview Bench
-TOPICS = ['Code Reliability', 'Coding Standards', 'Code Organization', 'Performance Issues', 'Validation', 'Variables']
-
-# Example categories
-EXAMPLE_CATEGORIES = ['Bug_Fix', 'Code_Style', 'Performance', 'Security', 'Refactoring', 'Documentation', 'Testing', 'Architecture', 'Other']
-
-# Metrics for CodeReview Bench
-MULTIMETRIC_METRICS = ["readability", "relevance", "explanation_clarity", "problem_identification", "actionability", "completeness", "specificity", "contextual_adequacy", "consistency", "brevity"]
-
-        # Create a tuple with both the internal name and display name
-        if column_info.name not in default_visible_columns:
-            column_choices.append((column_info.name, column_info.display_name))
-
-def get_default_visible_columns():
-    """
-    Get the list of column names that should be visible by default.
-
-    Returns:
-        List of column names that are displayed by default.
-    """
-    return [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
-            if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
+from dataclasses import dataclass, make_dataclass
+from enum import Enum

+import pandas as pd

+from src.about import Tasks

+def fields(raw_class):
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


+# These classes are for user facing column names,
+# to avoid having to change them all around the code
+# when a modif is needed
 @dataclass
+class ColumnContent:
     name: str
+    type: str
+    displayed_by_default: bool
     hidden: bool = False
     never_hidden: bool = False

+## Leaderboard columns
+auto_eval_column_dict = []
+# Init
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+#Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# Model information
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+
+# We use make dataclass to dynamically fill the scores from Tasks
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+
+## For the queue columns in the submission tab
+@dataclass(frozen=True)
+class EvalQueueColumn:  # Queue column
+    model = ColumnContent("model", "markdown", True)
+    revision = ColumnContent("revision", "str", True)
+    private = ColumnContent("private", "bool", True)
+    precision = ColumnContent("precision", "str", True)
+    weight_type = ColumnContent("weight_type", "str", "Original")
+    status = ColumnContent("status", "str", True)
+
+## All the model information that we might need
 @dataclass
+class ModelDetails:
+    name: str
+    display_name: str = ""
+    symbol: str = ""  # emoji

+class ModelType(Enum):
+    PT = ModelDetails(name="pretrained", symbol="🟢")
+    FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    Unknown = ModelDetails(name="", symbol="?")
+
+    def to_str(self, separator=" "):
+        return f"{self.value.symbol}{separator}{self.value.name}"
+
+    @staticmethod
+    def from_str(type):
+        if "fine-tuned" in type or "🔶" in type:
+            return ModelType.FT
+        if "pretrained" in type or "🟢" in type:
+            return ModelType.PT
+        if "RL-tuned" in type or "🟦" in type:
+            return ModelType.RL
+        if "instruction-tuned" in type or "⭕" in type:
+            return ModelType.IFT
+        return ModelType.Unknown

+class WeightType(Enum):
+    Adapter = ModelDetails("Adapter")
+    Original = ModelDetails("Original")
+    Delta = ModelDetails("Delta")

+class Precision(Enum):
+    float16 = ModelDetails("float16")
+    bfloat16 = ModelDetails("bfloat16")
+    Unknown = ModelDetails("?")

+    def from_str(precision):
+        if precision in ["torch.float16", "float16"]:
+            return Precision.float16
+        if precision in ["torch.bfloat16", "bfloat16"]:
+            return Precision.bfloat16
+        return Precision.Unknown

+# Column selection
+COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

+EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
+EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

+BENCHMARK_COLS = [t.value.col_name for t in Tasks]
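A self-contained sketch of the `make_dataclass` + `fields()` pattern the new `utils.py` relies on. `ColumnContent` is declared frozen here so the default instances stay hashable on newer Python versions; that is an assumption layered on top of the file for the sake of a runnable example, not part of it, and the column names are placeholders.

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


def fields(raw_class):
    # Same class-attribute walk as src/display/utils.py: returns the default
    # ColumnContent instances stored on the generated class.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


columns = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
    ["anli", ColumnContent, ColumnContent("ANLI", "number", True)],
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", columns, frozen=True)

print([c.name for c in fields(AutoEvalColumn)])                    # ['Model', 'Average ⬆️', 'ANLI']
print([c.name for c in fields(AutoEvalColumn) if c.never_hidden])  # ['Model']
```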
src/envs.py
CHANGED
@@ -1,27 +1,25 @@
 import os
 from huggingface_hub import HfApi
-from dotenv import load_dotenv
-OWNER = os.environ.get("OWNER", "codereview-bench") # Change to your org
-SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
-ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME")
-ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
-DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
-# HF API instance
+# Info to change for your repository
+# ----------------------------------
+TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
+OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+# ----------------------------------
+REPO_ID = f"{OWNER}/leaderboard"
+QUEUE_REPO = f"{OWNER}/requests"
+RESULTS_REPO = f"{OWNER}/results"
+# If you setup a cache later, just change HF_HOME
+CACHE_PATH = os.getenv("HF_HOME", ".")
+# Local caches
+EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 API = HfApi(token=TOKEN)

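Editor's note: these repo IDs and cache paths are typically mirrored locally before the leaderboard is built; the exact call lives in app.py, which is not shown in this hunk, so the sketch below is an assumption about the wiring rather than a quote of it:

from huggingface_hub import snapshot_download
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, RESULTS_REPO, TOKEN

# Mirror the request queue and results datasets into the local cache so the
# populate helpers can read plain JSON files from disk.
snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", token=TOKEN)
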
src/leaderboard/__init__.py
DELETED
@@ -1 +0,0 @@
-# Leaderboard processing module

src/leaderboard/processor.py
DELETED
@@ -1,271 +0,0 @@
-"""
-Process CodeReview Bench leaderboard data and submissions.
-"""
-
-import json
-import os
-import pandas as pd
-from datetime import datetime
-from typing import Dict, List, Tuple, Optional
-import numpy as np
-
-from src.display.utils import (
-    CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES, COMMENT_LANGUAGES, EXAMPLE_CATEGORIES,
-    MULTIMETRIC_METRICS, EXACT_MATCH_METRICS
-)
-
-
-def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
-    """
-    Process a JSONL submission file for CodeReview Bench.
-
-    Args:
-        file_path: Path to the JSONL submission file
-
-    Returns:
-        Tuple of (entries_list, message)
-    """
-    try:
-        entries = []
-        with open(file_path, 'r', encoding='utf-8') as f:
-            for line_num, line in enumerate(f, 1):
-                line = line.strip()
-                if not line:
-                    continue
-
-                try:
-                    entry = json.loads(line)
-
-                    # Validate required fields
-                    required_fields = ['model_name', 'programming_language', 'comment_language']
-                    missing_fields = [field for field in required_fields if field not in entry]
-                    if missing_fields:
-                        return [], f"Missing required fields {missing_fields} in line {line_num}"
-
-                    # Validate metrics exist
-                    has_multimetric = any(metric in entry for metric in MULTIMETRIC_METRICS)
-                    has_exact_match = any(metric in entry for metric in EXACT_MATCH_METRICS)
-
-                    if not has_multimetric and not has_exact_match:
-                        return [], f"No valid metrics found in line {line_num}. Required: {MULTIMETRIC_METRICS + EXACT_MATCH_METRICS}"
-
-                    entries.append(entry)
-
-                except json.JSONDecodeError as e:
-                    return [], f"Invalid JSON in line {line_num}: {e}"
-
-        if not entries:
-            return [], "No valid entries found in submission file"
-
-        return entries, f"Successfully processed {len(entries)} entries"
-
-    except Exception as e:
-        return [], f"Error processing submission: {e}"
-
-
-def calculate_overall_score(entry: Dict) -> float:
-    """
-    Calculate overall score for a CodeReview Bench entry.
-
-    Args:
-        entry: Dictionary containing model evaluation results
-
-    Returns:
-        Overall score as float
-    """
-    # Calculate multimetric average
-    multimetric_scores = []
-    for metric in MULTIMETRIC_METRICS:
-        if metric in entry and isinstance(entry[metric], (int, float)):
-            multimetric_scores.append(entry[metric])
-
-    multimetric_avg = np.mean(multimetric_scores) if multimetric_scores else 0
-
-    # Calculate exact match average
-    exact_match_scores = []
-    for metric in EXACT_MATCH_METRICS:
-        if metric in entry and isinstance(entry[metric], (int, float)):
-            exact_match_scores.append(entry[metric])
-
-    exact_match_avg = np.mean(exact_match_scores) if exact_match_scores else 0
-
-    # Weighted combination (can be adjusted based on requirements)
-    overall_score = (multimetric_avg * 0.7) + (exact_match_avg * 0.3)
-
-    return overall_score
-
-
-def load_leaderboard_data(file_path: str) -> Dict:
-    """
-    Load the leaderboard data from a JSON file.
-    """
-    if not os.path.exists(file_path):
-        version = "v0"
-        if "_v" in file_path:
-            version = file_path.split("_")[-1].split(".")[0]
-        return {"entries": [], "last_updated": datetime.now().isoformat(), "version": version}
-
-    with open(file_path, 'r') as f:
-        data = json.load(f)
-
-    # Ensure version field exists
-    if "version" not in data:
-        version = "v0"
-        if "_v" in file_path:
-            version = file_path.split("_")[-1].split(".")[0]
-        data["version"] = version
-
-    return data
-
-
-def save_leaderboard_data(data: Dict, file_path: str) -> None:
-    """
-    Save the leaderboard data to a JSON file.
-    """
-    # Ensure the directory exists
-    os.makedirs(os.path.dirname(file_path), exist_ok=True)
-
-    # Update the last_updated timestamp
-    data["last_updated"] = datetime.now().isoformat()
-
-    # Ensure version is set
-    if "version" not in data:
-        version = "v0"
-        if "_v" in file_path:
-            version = file_path.split("_")[-1].split(".")[0]
-        data["version"] = version
-
-    with open(file_path, 'w') as f:
-        json.dump(data, f, indent=2)
-
-
-def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
-    """
-    Convert leaderboard data to a pandas DataFrame for display.
-    """
-    rows = []
-
-    for entry in leaderboard_data.get("entries", []):
-        model_name = entry.get("model_name", "Unknown Model")
-
-        # Extract basic metadata
-        row = {
-            "model_name": model_name,
-            "model_type": entry.get("model_type", "Unknown"),
-            "mode": entry.get("mode", "Strict"),
-            "submission_date": entry.get("submission_date", ""),
-            "version": entry.get("version", "v0"),
-            "review_model_type": entry.get("review_model_type", "custom").lower()
-        }
-
-        # Add additional metadata fields if present
-        for key in ["base_model", "revision", "precision", "weight_type", "topic", "programming_language", "comment_language"]:
-            if key in entry:
-                row[key] = entry[key]
-
-        # Add multimetric scores
-        for metric in MULTIMETRIC_METRICS:
-            if metric in entry:
-                row[metric] = entry[metric]
-            else:
-                row[metric] = pd.NA
-
-        # Add exact match metrics
-        for metric in EXACT_MATCH_METRICS:
-            if metric in entry:
-                row[metric] = entry[metric]
-            else:
-                row[metric] = pd.NA
-
-        # Calculate aggregated metrics
-        multimetric_scores = [entry.get(metric, 0) for metric in MULTIMETRIC_METRICS if metric in entry and pd.notna(entry[metric])]
-        exact_match_scores = [entry.get(metric, 0) for metric in EXACT_MATCH_METRICS if metric in entry and pd.notna(entry[metric])]
-
-        if multimetric_scores:
-            row["multimetric_average"] = np.mean(multimetric_scores)
-        else:
-            row["multimetric_average"] = pd.NA
-
-        if exact_match_scores:
-            row["exact_match_average"] = np.mean(exact_match_scores)
-        else:
-            row["exact_match_average"] = pd.NA
-
-        # Calculate overall score
-        row["overall_score"] = calculate_overall_score(entry)
-
-        # Add language-specific metrics if available
-        for lang in COMMENT_LANGUAGES:
-            for metric in ["readability", "relevance", "overall_score"]:
-                lang_key = f"{lang}_{metric}"
-                if lang_key in entry:
-                    row[lang_key] = entry[lang_key]
-                else:
-                    row[lang_key] = pd.NA
-
-        # Add evaluation count
-        row["total_evaluations"] = entry.get("total_evaluations", entry.get("evaluation_count", pd.NA))
-
-        rows.append(row)
-
-    # Create DataFrame and sort by overall score
-    df = pd.DataFrame(rows)
-
-    # Ensure all expected columns exist
-    for metric in MULTIMETRIC_METRICS + EXACT_MATCH_METRICS:
-        if metric not in df.columns:
-            df[metric] = pd.NA
-
-    # Sort by overall score (descending)
-    if not df.empty:
-        df = df.sort_values(by="overall_score", ascending=False, na_position='last')
-
-    # Ensure summary columns exist
-    summary_cols = ["overall_score", "multimetric_average", "exact_match_average", "total_evaluations"]
-    for col in summary_cols:
-        if col not in df.columns:
-            df[col] = pd.NA
-
-    return df
-
-
-def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict]) -> Dict:
-    """
-    Add new entries to the leaderboard, replacing any with the same model name.
-    """
-    # Create a mapping of existing entries by model name and version
-    existing_entries = {
-        (entry["model_name"], entry.get("version", "v0")): i
-        for i, entry in enumerate(leaderboard_data.get("entries", []))
-    }
-
-    # Process each new entry
-    for new_entry in new_entries:
-        model_name = new_entry.get("model_name")
-        version = new_entry.get("version", "v0")
-
-        # Add calculated metrics
-        new_entry["overall_score"] = calculate_overall_score(new_entry)
-
-        # Calculate averages
-        multimetric_scores = [new_entry.get(metric) for metric in MULTIMETRIC_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
-        exact_match_scores = [new_entry.get(metric) for metric in EXACT_MATCH_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
-
-        if multimetric_scores:
-            new_entry["multimetric_average"] = np.mean(multimetric_scores)
-        if exact_match_scores:
-            new_entry["exact_match_average"] = np.mean(exact_match_scores)
-
-        if (model_name, version) in existing_entries:
-            # Replace existing entry
-            leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
-        else:
-            # Add new entry
-            if "entries" not in leaderboard_data:
-                leaderboard_data["entries"] = []
-            leaderboard_data["entries"].append(new_entry)
-
-    # Update the last_updated timestamp
-    leaderboard_data["last_updated"] = datetime.now().isoformat()
-
-    return leaderboard_data

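Editor's note: the scoring removed here was a fixed 70/30 blend, so with a multimetric average of 0.80 and an exact-match average of 0.60 the overall score works out to 0.74. A tiny illustration with hypothetical values (the real metric names come from MULTIMETRIC_METRICS / EXACT_MATCH_METRICS, which are not shown in this hunk):

multimetric_avg = 0.80   # hypothetical average over the multimetric columns
exact_match_avg = 0.60   # hypothetical average over the exact-match columns
overall_score = (multimetric_avg * 0.7) + (exact_match_avg * 0.3)   # = 0.74
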
src/leaderboard/read_evals.py
ADDED
@@ -0,0 +1,196 @@
+import glob
+import json
+import math
+import os
+from dataclasses import dataclass
+
+import dateutil
+import numpy as np
+
+from src.display.formatting import make_clickable_model
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.submission.check_validity import is_model_on_hub
+
+
+@dataclass
+class EvalResult:
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
+    """
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
+    model: str
+    revision: str  # commit hash, "" if main
+    results: dict
+    precision: Precision = Precision.Unknown
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
+    license: str = "?"
+    likes: int = 0
+    num_params: int = 0
+    date: str = ""  # submission date of request file
+    still_on_hub: bool = False
+
+    @classmethod
+    def init_from_json_file(self, json_filepath):
+        """Inits the result from the specific model result file"""
+        with open(json_filepath) as fp:
+            data = json.load(fp)
+
+        config = data.get("config")
+
+        # Precision
+        precision = Precision.from_str(config.get("model_dtype"))
+
+        # Get model and org
+        org_and_model = config.get("model_name", config.get("model_args", None))
+        org_and_model = org_and_model.split("/", 1)
+
+        if len(org_and_model) == 1:
+            org = None
+            model = org_and_model[0]
+            result_key = f"{model}_{precision.value.name}"
+        else:
+            org = org_and_model[0]
+            model = org_and_model[1]
+            result_key = f"{org}_{model}_{precision.value.name}"
+        full_model = "/".join(org_and_model)
+
+        still_on_hub, _, model_config = is_model_on_hub(
+            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+        )
+        architecture = "?"
+        if model_config is not None:
+            architectures = getattr(model_config, "architectures", None)
+            if architectures:
+                architecture = ";".join(architectures)
+
+        # Extract results available in this file (some results are split in several files)
+        results = {}
+        for task in Tasks:
+            task = task.value
+
+            # We average all scores of a given metric (not all metrics are present in all files)
+            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            if accs.size == 0 or any([acc is None for acc in accs]):
+                continue
+
+            mean_acc = np.mean(accs) * 100.0
+            results[task.benchmark] = mean_acc
+
+        return self(
+            eval_name=result_key,
+            full_model=full_model,
+            org=org,
+            model=model,
+            results=results,
+            precision=precision,
+            revision=config.get("model_sha", ""),
+            still_on_hub=still_on_hub,
+            architecture=architecture
+        )
+
+    def update_with_request_file(self, requests_path):
+        """Finds the relevant request file for the current model and updates info with it"""
+        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+
+        try:
+            with open(request_file, "r") as f:
+                request = json.load(f)
+            self.model_type = ModelType.from_str(request.get("model_type", ""))
+            self.weight_type = WeightType[request.get("weight_type", "Original")]
+            self.license = request.get("license", "?")
+            self.likes = request.get("likes", 0)
+            self.num_params = request.get("params", 0)
+            self.date = request.get("submitted_time", "")
+        except Exception:
+            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+
+    def to_dict(self):
+        """Converts the Eval Result to a dict compatible with our dataframe display"""
+        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        data_dict = {
+            "eval_name": self.eval_name,  # not a column, just a save name,
+            AutoEvalColumn.precision.name: self.precision.value.name,
+            AutoEvalColumn.model_type.name: self.model_type.value.name,
+            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            AutoEvalColumn.architecture.name: self.architecture,
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.revision.name: self.revision,
+            AutoEvalColumn.average.name: average,
+            AutoEvalColumn.license.name: self.license,
+            AutoEvalColumn.likes.name: self.likes,
+            AutoEvalColumn.params.name: self.num_params,
+            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+        }
+
+        for task in Tasks:
+            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+
+        return data_dict
+
+
+def get_request_file_for_model(requests_path, model_name, precision):
+    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
+    request_files = os.path.join(
+        requests_path,
+        f"{model_name}_eval_request_*.json",
+    )
+    request_files = glob.glob(request_files)
+
+    # Select correct request file (precision)
+    request_file = ""
+    request_files = sorted(request_files, reverse=True)
+    for tmp_request_file in request_files:
+        with open(tmp_request_file, "r") as f:
+            req_content = json.load(f)
+            if (
+                req_content["status"] in ["FINISHED"]
+                and req_content["precision"] == precision.split(".")[-1]
+            ):
+                request_file = tmp_request_file
+    return request_file
+
+
+def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+    """From the path of the results folder root, extract all needed info for results"""
+    model_result_filepaths = []
+
+    for root, _, files in os.walk(results_path):
+        # We should only have json files in model results
+        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
+            continue
+
+        # Sort the files by date
+        try:
+            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+        except dateutil.parser._parser.ParserError:
+            files = [files[-1]]
+
+        for file in files:
+            model_result_filepaths.append(os.path.join(root, file))
+
+    eval_results = {}
+    for model_result_filepath in model_result_filepaths:
+        # Creation of result
+        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        eval_result.update_with_request_file(requests_path)
+
+        # Store results of same eval together
+        eval_name = eval_result.eval_name
+        if eval_name in eval_results.keys():
+            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        else:
+            eval_results[eval_name] = eval_result
+
+    results = []
+    for v in eval_results.values():
+        try:
+            v.to_dict()  # we test if the dict version is complete
+            results.append(v)
+        except KeyError:  # not all eval values present
+            continue
+
+    return results

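Editor's note: init_from_json_file above expects a result file with a "config" block and a per-benchmark "results" block. A hypothetical example consistent with that code (all values made up; the benchmark and metric keys must match whatever Tasks defines, which is outside this hunk):

# Hypothetical shape of one results_*.json file read by EvalResult.init_from_json_file
example_result = {
    "config": {
        "model_dtype": "torch.bfloat16",      # parsed by Precision.from_str
        "model_name": "some-org/some-model",  # split into org / model
        "model_sha": "abc123",                # recorded as the revision
    },
    "results": {
        "some_benchmark": {"some_metric": 0.723},  # averaged and scaled to 72.3 for display
    },
}
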
src/populate.py
CHANGED
@@ -1,188 +1,58 @@
-"""
-Populate the CodeReview Bench leaderboard from HuggingFace datasets.
-"""
-
 import json
 import os
-import pandas as pd
-import tempfile
-from typing import Dict, List, Optional
-from datetime import datetime
-import numpy as np
-
-from huggingface_hub import hf_hub_download, HfApi
-from datasets import load_dataset
-
-from src.display.utils import CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES
-from src.envs import RESULTS_DATASET_ID, TOKEN, CACHE_PATH
-from src.leaderboard.processor import leaderboard_to_dataframe
-
-
-def get_latest_leaderboard(version="v0") -> Optional[Dict]:
-    """
-    Get the latest leaderboard data from HuggingFace dataset.
-    Fallback to local JSON file if HF download fails or is unavailable.
-    """
-    # First try to fetch from HuggingFace Hub
-    try:
-        leaderboard_path = hf_hub_download(
-            repo_id=RESULTS_DATASET_ID,
-            filename=f"leaderboards/leaderboard_{version}.json",
-            repo_type="dataset",
-            token=TOKEN
-        )
-        with open(leaderboard_path, 'r') as f:
-            return json.load(f)
-    except Exception as hf_err:
-        print(f"HF download failed or unavailable: {hf_err}. Trying local fallback...")
-
-    # Fallback: attempt to load a local leaderboard_data.json located at the project root
-    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-    local_path_candidates = [
-        os.path.join(project_root, "leaderboard_data.json"),  # legacy path in root
-        os.path.join(project_root, "data", "leaderboard.json"),  # path defined in envs.py
-    ]
-
-    for local_path in local_path_candidates:
-        if os.path.exists(local_path):
-            try:
-                with open(local_path, 'r') as f:
-                    return json.load(f)
-            except Exception as local_err:
-                print(f"Error loading local leaderboard file {local_path}: {local_err}")
-
-    # If nothing found, return None
-    return None
-
-
-def get_model_entry(model_name: str, mode: str, version="v0") -> Optional[Dict]:
-    """
-    Get a specific model's entry from the entries folder, uniquely identified by model_name, mode, and version.
-    """
-    try:
-        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
-        mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
-        entry_path = hf_hub_download(
-            repo_id=RESULTS_DATASET_ID,
-            filename=f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json",
-            repo_type="dataset",
-            token=TOKEN
-        )
-        with open(entry_path, 'r') as f:
-            return json.load(f)
-    except Exception as e:
-        print(f"Error downloading model entry: {e}")
-        return None
-
-
-def get_all_entries(version="v0") -> List[Dict]:
-    """
-    Get all entries from the HuggingFace dataset.
-    """
-    try:
-        api = HfApi(token=TOKEN)
-        files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
-        entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
-
-        all_entries = []
-        for entry_file in entry_files:
-            try:
-                entry_path = hf_hub_download(
-                    repo_id=RESULTS_DATASET_ID,
-                    filename=entry_file,
-                    repo_type="dataset",
-                    token=TOKEN
-                )
-                with open(entry_path, 'r') as f:
-                    entry_data = json.load(f)
-                    all_entries.append(entry_data)
-            except Exception as e:
-                print(f"Error loading entry {entry_file}: {e}")
-
-        return all_entries
-    except Exception as e:
-        print(f"Error getting all entries: {e}")
-        return []
-
-
-def get_leaderboard_df(version="v0") -> pd.DataFrame:
-    """
-    Get the leaderboard data as a DataFrame.
-    """
-    # Get latest leaderboard data
-    leaderboard_data = get_latest_leaderboard(version)
-
-    if not leaderboard_data:
-        # If no leaderboard exists, try to build it from entries
-        entries = get_all_entries(version)
-        if entries:
-            leaderboard_data = {
-                "entries": entries,
-                "last_updated": datetime.now().isoformat(),
-                "version": version
-            }
-        else:
-            # Return empty DataFrame if no data available
-            return pd.DataFrame(columns=DISPLAY_COLS)
-
-    # Convert to DataFrame
-    return leaderboard_to_dataframe(leaderboard_data)
-
-
-def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
-    """
-    Get the leaderboard data filtered by a specific programming language category.
-    """
-    # Get latest leaderboard data
-    leaderboard_data = get_latest_leaderboard(version)
-
-    if not leaderboard_data:
-        # If no leaderboard exists, try to build it from entries
-        entries = get_all_entries(version)
-        if entries:
-            leaderboard_data = {
-                "entries": entries,
-                "last_updated": datetime.now().isoformat(),
-                "version": version
-            }
-        else:
-            # Return empty DataFrame if no data available
-            return pd.DataFrame(columns=DISPLAY_COLS)
-
-    # Filter entries to only include those with data for the specified programming language
-    filtered_entries = []
-    for entry in leaderboard_data.get("entries", []):
-        # Check if entry has data for this programming language
-        programming_language = entry.get("programming_language", "").lower()
-        if programming_language == category.lower() or category.lower() == "other":
-            # For "other" category, include entries that don't match any specific language
-            if category.lower() == "other":
-                if programming_language not in [cat.lower() for cat in CATEGORIES[:-1]]:  # Exclude "Other" from check
-                    filtered_entries.append(entry)
-            else:
-                filtered_entries.append(entry)
-
-    # Create a new leaderboard data structure with the filtered entries
-    filtered_leaderboard = {
-        "entries": filtered_entries,
-        "last_updated": leaderboard_data.get("last_updated", datetime.now().isoformat()),
-        "version": version
-    }
-
-    # Convert to DataFrame
-    return leaderboard_to_dataframe(filtered_leaderboard)
+
+import pandas as pd
+
+from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.leaderboard.read_evals import get_raw_eval_results
+
+
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+    """Creates a dataframe from all the individual experiment results"""
+    raw_data = get_raw_eval_results(results_path, requests_path)
+    all_data_json = [v.to_dict() for v in raw_data]
+
+    df = pd.DataFrame.from_records(all_data_json)
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df[cols].round(decimals=2)
+
+    # filter out if any of the benchmarks have not been produced
+    df = df[has_no_nan_values(df, benchmark_cols)]
+    return df
+
+
+def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
+    """Creates the different dataframes for the evaluation queues requestes"""
+    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+    all_evals = []
+
+    for entry in entries:
+        if ".json" in entry:
+            file_path = os.path.join(save_path, entry)
+            with open(file_path) as fp:
+                data = json.load(fp)
+
+            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+
+            all_evals.append(data)
+        elif ".md" not in entry:
+            # this is a folder
+            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
+            for sub_entry in sub_entries:
+                file_path = os.path.join(save_path, entry, sub_entry)
+                with open(file_path) as fp:
+                    data = json.load(fp)
+
+                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+                all_evals.append(data)
+
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+    df_running = pd.DataFrame.from_records(running_list, columns=cols)
+    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+    return df_finished[cols], df_running[cols], df_pending[cols]

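Editor's note: a hedged sketch of how these two helpers are usually wired together with the column lists from src/display/utils.py and the cache paths from src/envs.py; the actual wiring lives in app.py, which is not part of this hunk, so treat this as an assumption about intended use rather than a quote:

from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH
from src.populate import get_evaluation_queue_df, get_leaderboard_df

# Leaderboard table plus the three queue views (finished / running / pending).
leaderboard_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
finished_df, running_df, pending_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
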
src/submission/__init__.py
DELETED
@@ -1 +0,0 @@
-# Submission handling module

src/submission/check_validity.py
ADDED
@@ -0,0 +1,99 @@
+import json
+import os
+import re
+from collections import defaultdict
+from datetime import datetime, timedelta, timezone
+
+import huggingface_hub
+from huggingface_hub import ModelCard
+from huggingface_hub.hf_api import ModelInfo
+from transformers import AutoConfig
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+
+def check_model_card(repo_id: str) -> tuple[bool, str]:
+    """Checks if the model card and license exist and have been filled"""
+    try:
+        card = ModelCard.load(repo_id)
+    except huggingface_hub.utils.EntryNotFoundError:
+        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
+
+    # Enforce license metadata
+    if card.data.license is None:
+        if not ("license_name" in card.data and "license_link" in card.data):
+            return False, (
+                "License not found. Please add a license to your model card using the `license` metadata or a"
+                " `license_name`/`license_link` pair."
+            )
+
+    # Enforce card content
+    if len(card.text) < 200:
+        return False, "Please add a description to your model card, it is too short."
+
+    return True, ""
+
+def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
+    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
+    try:
+        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        if test_tokenizer:
+            try:
+                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+            except ValueError as e:
+                return (
+                    False,
+                    f"uses a tokenizer which is not in a transformers release: {e}",
+                    None
+                )
+            except Exception as e:
+                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
+        return True, None, config
+
+    except ValueError:
+        return (
+            False,
+            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
+            None
+        )
+
+    except Exception as e:
+        return False, "was not found on hub!", None
+
+
+def get_model_size(model_info: ModelInfo, precision: str):
+    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
+    try:
+        model_size = round(model_info.safetensors["total"] / 1e9, 3)
+    except (AttributeError, TypeError):
+        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+
+    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
+    model_size = size_factor * model_size
+    return model_size
+
+def get_model_arch(model_info: ModelInfo):
+    """Gets the model architecture from the configuration"""
+    return model_info.config.get("architectures", "Unknown")
+
+def already_submitted_models(requested_models_dir: str) -> set[str]:
+    """Gather a list of already submitted models to avoid duplicates"""
+    depth = 1
+    file_names = []
+    users_to_submission_dates = defaultdict(list)
+
+    for root, _, files in os.walk(requested_models_dir):
+        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
+        if current_depth == depth:
+            for file in files:
+                if not file.endswith(".json"):
+                    continue
+                with open(os.path.join(root, file), "r") as f:
+                    info = json.load(f)
+                    file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
+
+                    # Select organisation
+                    if info["model"].count("/") == 0 or "submitted_time" not in info:
+                        continue
+                    organisation, _ = info["model"].split("/")
+                    users_to_submission_dates[organisation].append(info["submitted_time"])
+
+    return set(file_names), users_to_submission_dates

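Editor's note: despite the two-element return annotation, is_model_on_hub actually returns a (ok, error_fragment, config) triple, and the error fragments are written to be appended to the model name. A minimal usage sketch consistent with the code above ("some-org/some-model" is a hypothetical repo id):

from src.submission.check_validity import check_model_card, is_model_on_hub

# Both checks return a boolean plus a human-readable error to surface in the UI.
ok, error, config = is_model_on_hub("some-org/some-model", revision="main", test_tokenizer=True)
card_ok, card_error = check_model_card("some-org/some-model")
if not (ok and card_ok):
    print(error or card_error)
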
src/submission/submit.py
CHANGED
@@ -1,184 +1,119 @@
-"""
-Handle submissions to the CodeReview Bench leaderboard.
-"""
 import json
 import os
-    """
-    Validate a submission file.
-    """
-    try:
-        entries, message = process_jsonl_submission(file_path)
-        if not entries:
-            return False, message
-        return True, "Submission is valid"
-    except Exception as e:
-        return False, f"Error validating submission: {e}"
-
-
-def submit_entry_to_hub(entry: Dict, model_name: str, mode: str, version="v0") -> Tuple[bool, str]:
-    """
-    Submit a model's evaluation entry to the HuggingFace dataset. The entry is uniquely identified by model_name, mode, and version.
-    """
-    try:
-        # Create safe model name for file path
-        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
-        mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
-
-        # Create entry path in entries folder
-        entry_path = f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json"
-
-        # Save entry to temporary file
-        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
-            json.dump(entry, temp_file, indent=2)
-            temp_path = temp_file.name
-
-        # Upload file
-        api = HfApi(token=TOKEN)
-        api.upload_file(
-            path_or_fileobj=temp_path,
-            path_in_repo=entry_path,
-            repo_id=RESULTS_DATASET_ID,
-            repo_type="dataset",
-            commit_message=f"Add evaluation entry for {model_name} (mode {mode}, version {version})"
-        )
-
-        os.unlink(temp_path)
-        return True, f"Successfully uploaded evaluation entry for {model_name} (mode {mode})"
-    except Exception as e:
-        return False, f"Error submitting entry to dataset: {e}"
-
-
-def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool, str]:
-    """
-    Submit updated leaderboard to the HuggingFace dataset.
-    """
-    try:
-        # Create leaderboard data
-        leaderboard_data = {
-            "entries": entries,
-            "last_updated": datetime.now().isoformat(),
-            "version": version
-        }
-
-        # Save to temporary file
-        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
-            json.dump(leaderboard_data, temp_file, indent=2)
-            temp_path = temp_file.name
-
-        # Upload file
-        api = HfApi(token=TOKEN)
-        api.upload_file(
-            path_or_fileobj=temp_path,
-            path_in_repo=f"leaderboards/leaderboard_{version}.json",
-            repo_id=RESULTS_DATASET_ID,
-            repo_type="dataset",
-            commit_message=f"Update leaderboard for version {version}"
-        )
-
-        os.unlink(temp_path)
-        return True, "Leaderboard updated successfully"
-    except Exception as e:
-        return False, f"Error updating leaderboard: {e}"
-
-
-def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
-    """
-    Process a submission to the CodeReview Bench leaderboard.
-    """
-    try:
-        for entry_file in entry_files:
-            try:
-                entry_path = api.hf_hub_download(
-                    repo_id=RESULTS_DATASET_ID,
-                    filename=entry_file,
-                    repo_type="dataset",
-                )
-                with open(entry_path, 'r') as f:
-                    entry_data = json.load(f)
-                    all_entries.append(entry_data)
-            except Exception as e:
-                print(f"Error loading entry {entry_file}: {e}")
-
-        # Update leaderboard with all entries
-        success, message = submit_leaderboard_to_hub(all_entries, version)
-        if not success:
-            return styled_error(message)
-
-        return styled_message("Submission successful! Model evaluated and leaderboard updated.")
-
-    except Exception as e:
-        return styled_error(f"Error processing submission: {e}")
-    finally:
-        # Clean up temporary files if they exist
-        try:
-            if os.path.exists(file_path):
-                os.remove(file_path)
-        except:
-            pass
+from datetime import datetime, timezone
+
+from src.display.formatting import styled_error, styled_message, styled_warning
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
+from src.submission.check_validity import (
+    already_submitted_models,
+    check_model_card,
+    get_model_size,
+    is_model_on_hub,
+)
+
+REQUESTED_MODELS = None
+USERS_TO_SUBMISSION_DATES = None
+
+def add_new_eval(
+    model: str,
+    base_model: str,
+    revision: str,
+    precision: str,
+    weight_type: str,
+    model_type: str,
+):
+    global REQUESTED_MODELS
+    global USERS_TO_SUBMISSION_DATES
+    if not REQUESTED_MODELS:
+        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
+
+    user_name = ""
+    model_path = model
+    if "/" in model:
+        user_name = model.split("/")[0]
+        model_path = model.split("/")[1]
+
+    precision = precision.split(" ")[0]
+    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+    if model_type is None or model_type == "":
+        return styled_error("Please select a model type.")
+
+    # Does the model actually exist?
+    if revision == "":
+        revision = "main"
+
+    # Is the model on the hub?
+    if weight_type in ["Delta", "Adapter"]:
+        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
+        if not base_model_on_hub:
+            return styled_error(f'Base model "{base_model}" {error}')
+
+    if not weight_type == "Adapter":
+        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
+        if not model_on_hub:
+            return styled_error(f'Model "{model}" {error}')
+
+    # Is the model info correctly filled?
+    try:
+        model_info = API.model_info(repo_id=model, revision=revision)
+    except Exception:
+        return styled_error("Could not get your model information. Please fill it up properly.")
+
+    model_size = get_model_size(model_info=model_info, precision=precision)
+
+    # Were the model card and license filled?
+    try:
+        license = model_info.cardData["license"]
+    except Exception:
+        return styled_error("Please select a license for your model")
+
+    modelcard_OK, error_msg = check_model_card(model)
+    if not modelcard_OK:
+        return styled_error(error_msg)
+
+    # Seems good, creating the eval
+    print("Adding new eval")
+
+    eval_entry = {
+        "model": model,
+        "base_model": base_model,
+        "revision": revision,
+        "precision": precision,
+        "weight_type": weight_type,
+        "status": "PENDING",
+        "submitted_time": current_time,
+        "model_type": model_type,
+        "likes": model_info.likes,
+        "params": model_size,
+        "license": license,
+        "private": False,
+    }
+
+    # Check for duplicate submission
+    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
+        return styled_warning("This model has been already submitted.")
+
+    print("Creating eval file")
+    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
+    os.makedirs(OUT_DIR, exist_ok=True)
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
+
+    with open(out_path, "w") as f:
+        f.write(json.dumps(eval_entry))
+
+    print("Uploading eval file")
+    API.upload_file(
+        path_or_fileobj=out_path,
+        path_in_repo=out_path.split("eval-queue/")[1],
+        repo_id=QUEUE_REPO,
+        repo_type="dataset",
+        commit_message=f"Add {model} to eval queue",
+    )
+
+    # Remove the local file
+    os.remove(out_path)
+
+    return styled_message(
+        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
+    )
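Editor's note: per the eval_entry dict in add_new_eval above, each submission ends up as a small JSON request file at eval-queue/<user>/<model>_eval_request_False_<precision>_<weight_type>.json. A hypothetical example of its contents (all values made up; only the key names come from the code):

# Hypothetical request file written by add_new_eval
example_request = {
    "model": "some-org/some-model",
    "base_model": "",
    "revision": "main",
    "precision": "bfloat16",
    "weight_type": "Original",
    "status": "PENDING",   # other statuses used elsewhere in this diff: RERUN, RUNNING, FINISHED
    "submitted_time": "2024-01-01T00:00:00Z",
    "model_type": "🔶 fine-tuned",
    "likes": 0,
    "params": 7.24,
    "license": "apache-2.0",
    "private": False,
}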