.env.template DELETED
@@ -1,6 +0,0 @@
1
- HF_TOKEN="your_huggingface_write_token"
2
- OWNER="your_huggingface_username_or_org"
3
- RESULTS_DATASET_ID="your_username/guardbench-results"
4
- SUBMITTER_TOKEN="your_secret_submission_token"
5
- ADMIN_USERNAME="admin"
6
- ADMIN_PASSWORD="password" # Change this!
 
 
 
 
 
 
 
.gitignore CHANGED
@@ -1,52 +1,13 @@
1
- # Python
 
2
  __pycache__/
3
- *.py[cod]
4
- *$py.class
5
- *.so
6
- .Python
7
- env/
8
- build/
9
- develop-eggs/
10
- dist/
11
- downloads/
12
- eggs/
13
- .eggs/
14
- lib/
15
- lib64/
16
- parts/
17
- sdist/
18
- var/
19
- .venv/
20
- *.egg-info/
21
- .installed.cfg
22
- *.egg
23
- .gradio/
24
-
25
- # Environment variables
26
  .env
27
-
28
- # Virtual Environment
29
- venv/
30
- ENV/
31
-
32
- # IDE
33
- .idea/
34
  .vscode/
35
- *.swp
36
- *.swo
37
-
38
- # OS
39
- .DS_Store
40
- Thumbs.db
41
 
42
- # Hugging Face cache
43
  eval-queue/
44
  eval-results/
45
  eval-queue-bk/
46
  eval-results-bk/
47
-
48
- # Data files
49
- data/
50
-
51
- # Versioned leaderboard files
52
- data/leaderboard_v*.json
 
1
+ auto_evals/
2
+ venv/
3
  __pycache__/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  .env
5
+ .ipynb_checkpoints
6
+ *ipynb
 
 
 
 
 
7
  .vscode/
 
 
 
 
 
 
8
 
 
9
  eval-queue/
10
  eval-results/
11
  eval-queue-bk/
12
  eval-results-bk/
13
+ logs/
 
 
 
 
 
.gitmodules DELETED
@@ -1,3 +0,0 @@
1
- [submodule "guard-bench-submodule"]
2
- path = guard-bench-submodule
3
- url = https://github.com/whitecircle-ai/circle-guard-bench.git
 
 
 
 
.gradio/certificate.pem DELETED
@@ -1,31 +0,0 @@
1
- -----BEGIN CERTIFICATE-----
2
- MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
- TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
- cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
- WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
- ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
- MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
- h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
- 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
- A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
- T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
- B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
- B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
- KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
- OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
- jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
- qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
- rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
- HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
- hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
- ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
- 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
- NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
- ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
- TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
- jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
- oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
- 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
- mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
- emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
- -----END CERTIFICATE-----
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,227 +1,46 @@
1
  ---
2
  title: CodeReviewBench
3
- emoji: 😎
4
- colorFrom: gray
5
  colorTo: indigo
6
  sdk: gradio
7
-
8
- sdk_version: 4.44.1
9
  app_file: app.py
10
  pinned: true
11
- short_description: A comprehensive benchmark for codereview.
12
- models:
13
- - openai/gpt-4o-mini
14
- - openai/gpt-4o
15
- - claude-3-7-sonnet
16
- - deepseek/deepseek-r1
17
-
18
  ---
19
 
20
- # CodeReview Bench Leaderboard
21
-
22
- A comprehensive benchmark and leaderboard for code review generation models, inspired by [CodeReviewBench](https://huggingface.co/spaces/your-org/CodeReviewBench).
23
- ## Features
24
-
25
- - **Multi-Language Support**: Evaluates models across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more
26
- - **Dual Language Comments**: Supports both Russian and English comment languages
27
- - **Comprehensive Metrics**:
28
- - LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity)
29
- - Exact-match metrics (pass@1, pass@5, pass@10, BLEU@10)
30
- - **Interactive Visualization**: Compare model performance across categories with radar plots
31
- - **Easy Submission**: Submit your model results via web interface
32
-
33
- ## Metrics
34
-
35
- ### LLM-based Multimetric
36
-
37
- - **Readability**: How easy the review is to understand
38
- - **Relevance**: How relevant the review is to the code
39
- - **Explanation Clarity**: How clear the explanations are
40
- - **Problem Identification**: How well problems are identified
41
- - **Actionability**: How actionable the suggestions are
42
- - **Completeness**: How complete the review is
43
- - **Specificity**: How specific the feedback is
44
- - **Contextual Adequacy**: How well the review fits the context
45
- - **Consistency**: How consistent the review style is
46
- - **Brevity**: How concise the review is
47
-
48
- ### Exact-Match Metrics
49
-
50
- - **Pass@1**: Percentage of correct reviews on first attempt
51
- - **Pass@5**: Percentage of correct reviews in top 5 attempts
52
- - **Pass@10**: Percentage of correct reviews in top 10 attempts
53
- - **BLEU@10**: BLEU score for top 10 review candidates
54
-
55
- ## Programming Languages Supported
56
-
57
- - Python
58
- - JavaScript
59
- - Java
60
- - C++
61
- - C#
62
- - TypeScript
63
- - Go
64
- - Rust
65
- - Swift
66
- - Kotlin
67
- - Ruby
68
- - PHP
69
- - C
70
- - Scala
71
- - R
72
- - Dart
73
- - Other
74
-
75
- ## Comment Languages
76
-
77
- - Russian (ru)
78
- - English (en)
79
 
80
- ## Example Categories
81
-
82
- - Bug Fix
83
- - Code Style
84
- - Performance
85
- - Security
86
- - Refactoring
87
- - Documentation
88
- - Testing
89
- - Architecture
90
- - Other
91
-
92
- ## Installation
93
-
94
- ```bash
95
- pip install -r requirements.txt
96
- ```
97
-
98
- ## Usage
99
-
100
- ```bash
101
- python app.py
102
- ```
103
-
104
- ## Submission Format
105
-
106
- Submit your results as a JSONL file where each line contains:
107
 
 
108
  ```json
109
  {
110
- "model_name": "your-model-name",
111
- "programming_language": "python",
112
- "comment_language": "en",
113
- "readability": 8.5,
114
- "relevance": 9.0,
115
- "explanation_clarity": 7.8,
116
- "problem_identification": 8.2,
117
- "actionability": 8.7,
118
- "completeness": 8.0,
119
- "specificity": 7.5,
120
- "contextual_adequacy": 8.3,
121
- "consistency": 8.8,
122
- "brevity": 7.2,
123
- "pass_at_1": 0.75,
124
- "pass_at_5": 0.88,
125
- "pass_at_10": 0.92,
126
- "bleu_at_10": 0.65,
127
- "total_evaluations": 100
128
  }
129
  ```
130
 
131
- ## Environment Variables
132
-
133
- Set the following environment variables:
134
-
135
-
136
- ## Citation
137
 
138
- <<<<<<< HEAD
139
- - **Multi-tab Interface**: Organized navigation with dedicated sections
140
- - **Advanced Filtering**: Real-time filtering by multiple criteria
141
- - **Dark Theme**: Modern, GitHub-inspired dark interface
142
- - **IP-based Submissions**: Secure submission tracking
143
- - **Comprehensive Analytics**: Detailed performance insights
144
- - **Data Export**: Multiple export formats
145
- - **Rate Limiting**: Anti-spam protection
146
 
147
- ### 🔧 Technical Improvements
148
-
149
- - **Modular Architecture**: Clean separation of concerns
150
- - **Type Safety**: Full type annotations throughout
151
- - **Error Handling**: Comprehensive error handling and logging
152
- - **Data Validation**: Multi-layer validation with Pydantic
153
- - **Performance**: Optimized data processing and display
154
-
155
- ## 📈 Metrics & Evaluation
156
-
157
- ### Performance Metrics
158
-
159
- - **BLEU**: Text similarity score (0.0-1.0)
160
- - **Pass@1**: Success rate in single attempt (0.0-1.0)
161
- - **Pass@5**: Success rate in 5 attempts (0.0-1.0)
162
- - **Pass@10**: Success rate in 10 attempts (0.0-1.0)
163
-
164
- ### Quality Dimensions
165
-
166
- 1. **Readability**: How clear and readable are the reviews?
167
- 2. **Relevance**: How relevant to the code changes?
168
- 3. **Explanation Clarity**: How well does it explain issues?
169
- 4. **Problem Identification**: How effectively does it identify problems?
170
- 5. **Actionability**: How actionable are the suggestions?
171
- 6. **Completeness**: How thorough are the reviews?
172
- 7. **Specificity**: How specific are the comments?
173
- 8. **Contextual Adequacy**: How well does it understand context?
174
- 9. **Consistency**: How consistent across different reviews?
175
- 10. **Brevity**: How concise without losing important information?
176
-
177
- ## 🔒 Security Features
178
-
179
- ### Rate Limiting
180
-
181
- - **5 submissions per IP per 24 hours**
182
- - **Automatic IP tracking and logging**
183
- - **Graceful error handling for rate limits**
184
-
185
- ### Data Validation
186
-
187
- - **Model name format validation**
188
- - **Score range validation (0.0-1.0 for performance, 0-10 for quality)**
189
- - **Logical consistency checks (Pass@1 ≤ Pass@5 ≤ Pass@10)**
190
- - **Required field validation**
191
-
192
- ### Audit Trail
193
-
194
- - **Complete submission logging**
195
- - **IP address tracking (partially masked for privacy)**
196
- - **Timestamp recording**
197
- - **Data integrity checks**
198
-
199
- ## 🤝 Contributing
200
-
201
- 1. Fork the repository
202
- 2. Create a feature branch
203
- 3. Make your changes
204
- 4. Add tests if applicable
205
- 5. Submit a pull request
206
-
207
- ## 📄 License
208
-
209
- This project is licensed under the MIT License - see the LICENSE file for details.
210
-
211
- ## 🙏 Acknowledgments
212
-
213
- - Inspired by [CodeReviewBench](https://huggingface.co/spaces/your-org/CodeReviewBench)
214
- - Built with [Gradio](https://gradio.app/) for the web interface
215
- - Thanks to the open-source community for tools and inspiration
216
-
217
- ## 📞 Support
218
-
219
- For questions, issues, or contributions:
220
-
221
- - Open an issue on GitHub
222
- - Check the documentation
223
- - Contact the maintainers
224
-
225
- ---
226
 
227
- **Built with ❤️ for the code review research community**
 
 
 
 
1
  ---
2
  title: CodeReviewBench
3
+ emoji: 🥇
4
+ colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
 
 
7
  app_file: app.py
8
  pinned: true
9
+ license: mit
10
+ short_description: Results of the benchmark from the CodeReviewBench paper
11
+ sdk_version: 5.19.0
 
 
 
 
12
  ---
13
 
14
+ # Start the configuration
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
+ Most of the variables to change for a default leaderboard are in `src/envs.py` (replace the repo paths with your own) and `src/about.py` (for tasks).
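As a hedged illustration (mirroring the stock leaderboard template; the task and metric names below are placeholders, not this benchmark's actual tasks), `src/about.py` declares one `Task` per scored column:

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # key under "results" in a result file
    metric: str      # metric key to read for that task
    col_name: str    # column header shown in the leaderboard table

class Tasks(Enum):
    # Placeholder entries -- replace with the benchmark's real task/metric names.
    task0 = Task("task_name", "metric_name", "Task 1")
    task1 = Task("task_name2", "metric_name", "Task 2")
```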
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
+ Result files should have the following format and be stored as JSON files:
19
  ```json
20
  {
21
+ "config": {
22
+ "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
23
+ "model_name": "path of the model on the hub: org/model",
24
+ "model_sha": "revision on the hub",
25
+ },
26
+ "results": {
27
+ "task_name": {
28
+ "metric_name": score,
29
+ },
30
+ "task_name2": {
31
+ "metric_name": score,
32
+ }
33
+ }
 
 
 
 
 
34
  }
35
  ```
36
 
37
+ Request files are created automatically by this tool.
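For a concrete picture of the result-file format above, a minimal sketch of producing one of these files (the repo id, revision, and output path are placeholders; scores must use the task/metric keys declared in `src/about.py`):

```python
import json
from pathlib import Path

result = {
    "config": {
        "model_dtype": "torch.float16",
        "model_name": "your-org/your-model",  # placeholder hub repo id
        "model_sha": "main",                  # revision on the hub
    },
    "results": {
        "task_name": {"metric_name": 0.75},
        "task_name2": {"metric_name": 0.68},
    },
}

out_dir = Path("eval-results/your-org")  # assumed local clone of the results dataset
out_dir.mkdir(parents=True, exist_ok=True)
(out_dir / "results_your-model.json").write_text(json.dumps(result, indent=2))
```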
 
 
 
 
 
38
 
39
+ If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
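If you prefer to restart programmatically rather than from the Space settings UI, a short sketch (the repo id is a placeholder and `HF_TOKEN` needs write access to the Space):

```python
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ["HF_TOKEN"])               # token with write access to the Space
api.restart_space(repo_id="your-org/CodeReviewBench")   # placeholder Space id
```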
 
 
 
 
 
 
 
40
 
41
+ # Code logic for more complex edits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ You'll find
44
+ - the main table's column names and properties in `src/display/utils.py` (see the sketch after this list)
45
+ - the logic to read all result and request files and convert them into dataframe rows, in `src/leaderboard/read_evals.py` and `src/populate.py`
46
+ - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
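For orientation, a hedged sketch of how columns are typically declared in `src/display/utils.py` (the column names and helper below are illustrative; check the actual file for this leaderboard's columns):

```python
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                  # header shown in the table
    type: str                  # "str", "number", "markdown", ...
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

def fields(raw_class):
    # Helper imported by app.py: returns the ColumnContent defaults attached to the class.
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]

# Illustrative columns; the real file declares one entry per leaderboard column.
_columns = [
    ("model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)),
    ("average", ColumnContent, ColumnContent("Average", "number", True)),
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", _columns, frozen=True)

# app.py derives its display options from these entries, e.g.:
default_cols = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default]
```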
SUBMISSION_EXAMPLE.md DELETED
@@ -1,266 +0,0 @@
1
- # 📝 Model Submission Example
2
-
3
- This guide shows you exactly how to submit your code review model to the leaderboard.
4
-
5
- ## 🚀 Step-by-Step Submission Process
6
-
7
- ### 1. **Access the Submission Form**
8
-
9
- - Open the CodeReview Leaderboard in your browser
10
- - Navigate to the **📝 Submit Model** tab
11
- - Click on the "📝 Submit New Model Results" accordion to expand the form
12
-
13
- ### 2. **Fill in Basic Information**
14
-
15
- #### **Model Name** ✨
16
-
17
- ```
18
- Example: microsoft/CodeT5-base
19
- Format: organization/model-name
20
- ```
21
-
22
- #### **Programming Language** 🔍
23
-
24
- ```
25
- Select: Python
26
- (or Java, JavaScript, C++, Go, Rust, etc.)
27
- ```
28
-
29
- #### **Comment Language** 🌍
30
-
31
- ```
32
- Select: English
33
- (or Chinese, Spanish, French, German, etc.)
34
- ```
35
-
36
- #### **Taxonomy Category** 🏷️
37
-
38
- ```
39
- Select: Bug Detection
40
- (or Security, Performance, Code Style, etc.)
41
- ```
42
-
43
- ### 3. **Performance Scores** (0.0 - 1.0)
44
-
45
- #### **BLEU Score**
46
-
47
- ```
48
- Example: 0.742
49
- Range: 0.0 to 1.0
50
- Description: Measures similarity between generated and reference reviews
51
- ```
52
-
53
- #### **Pass@1**
54
-
55
- ```
56
- Example: 0.685
57
- Range: 0.0 to 1.0
58
- Description: Success rate when model gets 1 attempt
59
- ```
60
-
61
- #### **Pass@5**
62
-
63
- ```
64
- Example: 0.834
65
- Range: 0.0 to 1.0
66
- Description: Success rate when model gets 5 attempts
67
- ```
68
-
69
- #### **Pass@10**
70
-
71
- ```
72
- Example: 0.901
73
- Range: 0.0 to 1.0
74
- Description: Success rate when model gets 10 attempts
75
- ```
76
-
77
- ### 4. **Quality Metrics** (0 - 10)
78
-
79
- Rate your model across these 10 dimensions:
80
-
81
- #### **Readability: 8**
82
-
83
- ```
84
- How clear and readable are the generated code reviews?
85
- Scale: 0 (unreadable) to 10 (very clear)
86
- ```
87
-
88
- #### **Relevance: 7**
89
-
90
- ```
91
- How relevant are the reviews to the actual code changes?
92
- Scale: 0 (irrelevant) to 10 (highly relevant)
93
- ```
94
-
95
- #### **Explanation Clarity: 8**
96
-
97
- ```
98
- How well does the model explain identified issues?
99
- Scale: 0 (unclear) to 10 (very clear explanations)
100
- ```
101
-
102
- #### **Problem Identification: 7**
103
-
104
- ```
105
- How effectively does it identify real code problems?
106
- Scale: 0 (misses issues) to 10 (finds all problems)
107
- ```
108
-
109
- #### **Actionability: 6**
110
-
111
- ```
112
- How actionable and useful are the suggestions?
113
- Scale: 0 (not actionable) to 10 (very actionable)
114
- ```
115
-
116
- #### **Completeness: 7**
117
-
118
- ```
119
- How thorough and complete are the reviews?
120
- Scale: 0 (incomplete) to 10 (comprehensive)
121
- ```
122
-
123
- #### **Specificity: 6**
124
-
125
- ```
126
- How specific are the comments and suggestions?
127
- Scale: 0 (too generic) to 10 (very specific)
128
- ```
129
-
130
- #### **Contextual Adequacy: 7**
131
-
132
- ```
133
- How well does it understand the code context?
134
- Scale: 0 (ignores context) to 10 (perfect context understanding)
135
- ```
136
-
137
- #### **Consistency: 6**
138
-
139
- ```
140
- How consistent is the model across different code reviews?
141
- Scale: 0 (inconsistent) to 10 (very consistent)
142
- ```
143
-
144
- #### **Brevity: 5**
145
-
146
- ```
147
- How concise are the reviews without losing important information?
148
- Scale: 0 (too verbose/too brief) to 10 (perfect length)
149
- ```
150
-
151
- ### 5. **Submit Your Model**
152
-
153
- - Click the **🚀 Submit Model** button
154
- - Wait for validation and processing
155
- - Check for success/error message
156
-
157
- ## 📋 Complete Example Submission
158
-
159
- Here's a real example of submitting the CodeT5-base model:
160
-
161
- ```yaml
162
- Model Information:
163
- Model Name: "microsoft/CodeT5-base"
164
- Programming Language: "Python"
165
- Comment Language: "English"
166
- Taxonomy Category: "Bug Detection"
167
-
168
- Performance Scores:
169
- BLEU Score: 0.742
170
- Pass@1: 0.685
171
- Pass@5: 0.834
172
- Pass@10: 0.901
173
-
174
- Quality Metrics:
175
- Readability: 8
176
- Relevance: 7
177
- Explanation Clarity: 8
178
- Problem Identification: 7
179
- Actionability: 6
180
- Completeness: 7
181
- Specificity: 6
182
- Contextual Adequacy: 7
183
- Consistency: 6
184
- Brevity: 5
185
- ```
186
-
187
- ## 🔒 Security & Rate Limiting
188
-
189
- ### **IP-based Rate Limiting**
190
-
191
- - **5 submissions per IP address per 24 hours**
192
- - Submissions are tracked by your IP address
193
- - Rate limit resets every 24 hours
194
-
195
- ### **Validation Rules**
196
-
197
- - Model name must follow `organization/model` format
198
- - All performance scores must be between 0.0 and 1.0
199
- - All quality metrics must be between 0 and 10
200
- - Pass@1 ≤ Pass@5 ≤ Pass@10 (logical consistency)
201
-
202
- ## ✅ After Submission
203
-
204
- ### **Immediate Feedback**
205
-
206
- You'll see one of these messages:
207
-
208
- #### **Success ✅**
209
-
210
- ```
211
- ✅ Submission recorded successfully!
212
- ```
213
-
214
- #### **Error Examples ❌**
215
-
216
- ```
217
- ❌ Rate limit exceeded: 5/5 submissions in 24 hours
218
- ❌ Model name contains invalid characters
219
- ❌ Pass@1 score cannot be higher than Pass@5
220
- ❌ Score BLEU out of range: 1.2 (must be between 0 and 1)
221
- ```
222
-
223
- ### **View Your Results**
224
-
225
- - Your model will appear in the **🏆 Leaderboard** tab
226
- - Use filters to find your specific submission
227
- - Check the **📈 Analytics** tab for submission history
228
-
229
- ## 🎯 Tips for Better Submissions
230
-
231
- ### **Model Naming**
232
-
233
- ```
234
- ✅ Good: "microsoft/CodeT5-base"
235
- ✅ Good: "facebook/bart-large"
236
- ✅ Good: "my-org/custom-model-v2"
237
- ❌ Bad: "my model"
238
- ❌ Bad: "model@v1.0"
239
- ```
240
-
241
- ### **Performance Scores**
242
-
243
- - Be honest and accurate with your evaluations
244
- - Use proper evaluation methodology
245
- - Ensure Pass@k scores are logically consistent
246
- - Document your evaluation process
247
-
248
- ### **Quality Metrics**
249
-
250
- - Rate based on actual model performance
251
- - Consider multiple test cases
252
- - Be objective in your assessment
253
- - Document your rating criteria
254
-
255
- ## 🤝 Need Help?
256
-
257
- If you encounter issues:
258
-
259
- 1. Check the error message for specific guidance
260
- 2. Verify all fields are filled correctly
261
- 3. Ensure you haven't exceeded rate limits
262
- 4. Contact maintainers if problems persist
263
-
264
- ---
265
-
266
- **Ready to submit your model? Head to the 📝 Submit Model tab and follow this guide!** 🚀
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,20 +1,8 @@
1
- """
2
- CodeReview Leaderboard - Inspired by CodeReviewBench
3
- A comprehensive leaderboard for code review generation models
4
- """
5
-
6
- import os
7
- import json
8
- import tempfile
9
- import logging
10
  import gradio as gr
 
11
  import pandas as pd
12
- import plotly.express as px
13
- import plotly.graph_objects as go
14
  from apscheduler.schedulers.background import BackgroundScheduler
15
- import numpy as np
16
- from gradio.themes.utils import fonts, colors
17
- from dataclasses import fields, dataclass
18
 
19
  from src.about import (
20
  CITATION_BUTTON_LABEL,
@@ -26,1091 +14,191 @@ from src.about import (
26
  )
27
  from src.display.css_html_js import custom_css
28
  from src.display.utils import (
29
- CODEREVIEW_COLUMN,
30
- DISPLAY_COLS,
31
- METRIC_COLS,
32
- HIDDEN_COLS,
33
- NEVER_HIDDEN_COLS,
34
- CATEGORIES,
35
- COMMENT_LANGUAGES,
36
- EXAMPLE_CATEGORIES,
37
- TOPICS,
38
  ModelType,
39
- Mode,
40
- Precision,
41
  WeightType,
42
- ReviewModelType,
43
- get_all_column_choices,
44
- get_default_visible_columns,
45
- )
46
- from src.display.formatting import styled_message, styled_error, styled_warning
47
- from src.envs import (
48
- ADMIN_USERNAME,
49
- ADMIN_PASSWORD,
50
- RESULTS_DATASET_ID,
51
- SUBMITTER_TOKEN,
52
- TOKEN,
53
- DATA_PATH,
54
- )
55
- from src.populate import get_leaderboard_df, get_category_leaderboard_df
56
- from src.submission.submit import process_submission
57
-
58
- # Configure logging
59
- logging.basicConfig(
60
- level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
61
  )
62
- logger = logging.getLogger(__name__)
 
 
63
 
64
- # Ensure data directory exists
65
- os.makedirs(DATA_PATH, exist_ok=True)
66
 
67
- # Available benchmark versions
68
- BENCHMARK_VERSIONS = ["v0"]
69
- CURRENT_VERSION = "v0"
70
 
71
- # Initialize leaderboard data
72
  try:
73
- logger.info("Initializing leaderboard data...")
74
- LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
75
- logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
76
- except Exception as e:
77
- logger.error(f"Error loading leaderboard data: {e}")
78
- LEADERBOARD_DF = pd.DataFrame()
79
-
80
- custom_theme = gr.themes.Default(
81
- primary_hue=colors.slate,
82
- secondary_hue=colors.slate,
83
- neutral_hue=colors.neutral,
84
- font=(fonts.GoogleFont("Inter"), "sans-serif"),
85
- ).set(
86
- # font_size="16px",
87
- body_background_fill="#0f0f10",
88
- body_background_fill_dark="#0f0f10",
89
- body_text_color="#f4f4f5",
90
- body_text_color_subdued="#a1a1aa",
91
- block_background_fill="#1e1e1e", # Cooler Grey
92
- block_border_color="#333333", # Cooler Grey
93
- block_shadow="none",
94
- # Swapped primary and secondary button styles
95
- button_primary_background_fill="#121212", # Changed to specific color for Refresh button
96
- button_primary_text_color="#f4f4f5",
97
- button_primary_border_color="#333333", # Keep border grey or change to #121212?
98
- button_secondary_background_fill="#f4f4f5",
99
- button_secondary_text_color="#0f0f10",
100
- button_secondary_border_color="#f4f4f5",
101
- input_background_fill="#1e1e1e", # Cooler Grey
102
- input_border_color="#333333", # Cooler Grey
103
- input_placeholder_color="#71717a",
104
- table_border_color="#333333", # Cooler Grey
105
- table_even_background_fill="#2d2d2d", # Cooler Grey (Slightly lighter)
106
- table_odd_background_fill="#1e1e1e", # Cooler Grey
107
- table_text_color="#f4f4f5",
108
- link_text_color="#ffffff",
109
- border_color_primary="#333333", # Cooler Grey
110
- background_fill_secondary="#333333", # Cooler Grey
111
- color_accent="#f4f4f5",
112
- border_color_accent="#333333", # Cooler Grey
113
- button_primary_background_fill_hover="#424242", # Cooler Grey
114
- block_title_text_color="#f4f4f5",
115
- accordion_text_color="#f4f4f5",
116
- panel_background_fill="#1e1e1e", # Cooler Grey
117
- panel_border_color="#333333", # Cooler Grey
118
- # Explicitly setting primary/secondary/accent colors/borders
119
- background_fill_primary="#0f0f10",
120
- background_fill_primary_dark="#0f0f10",
121
- background_fill_secondary_dark="#333333", # Cooler Grey
122
- border_color_primary_dark="#333333", # Cooler Grey
123
- border_color_accent_dark="#333333", # Cooler Grey
124
- border_color_accent_subdued="#424242", # Cooler Grey
125
- border_color_accent_subdued_dark="#424242", # Cooler Grey
126
- color_accent_soft="#a1a1aa",
127
- color_accent_soft_dark="#a1a1aa",
128
- # Explicitly setting input hover/focus states
129
- input_background_fill_dark="#1e1e1e", # Cooler Grey
130
- input_background_fill_focus="#424242", # Cooler Grey
131
- input_background_fill_focus_dark="#424242", # Cooler Grey
132
- input_background_fill_hover="#2d2d2d", # Cooler Grey
133
- input_background_fill_hover_dark="#2d2d2d", # Cooler Grey
134
- input_border_color_dark="#333333", # Cooler Grey
135
- input_border_color_focus="#f4f4f5",
136
- input_border_color_focus_dark="#f4f4f5",
137
- input_border_color_hover="#424242", # Cooler Grey
138
- input_border_color_hover_dark="#424242", # Cooler Grey
139
- input_placeholder_color_dark="#71717a",
140
- # Explicitly set dark variants for table backgrounds
141
- table_even_background_fill_dark="#2d2d2d", # Cooler Grey
142
- table_odd_background_fill_dark="#1e1e1e", # Cooler Grey
143
- # Explicitly set dark text variants
144
- body_text_color_dark="#f4f4f5",
145
- body_text_color_subdued_dark="#a1a1aa",
146
- block_title_text_color_dark="#f4f4f5",
147
- accordion_text_color_dark="#f4f4f5",
148
- table_text_color_dark="#f4f4f5",
149
- # Explicitly set dark panel/block variants
150
- panel_background_fill_dark="#1e1e1e", # Cooler Grey
151
- panel_border_color_dark="#333333", # Cooler Grey
152
- block_background_fill_dark="#1e1e1e", # Cooler Grey
153
- block_border_color_dark="#333333", # Cooler Grey
154
- )
155
-
156
-
157
- @dataclass
158
- class ColumnInfo:
159
- """Information about a column in the leaderboard."""
160
-
161
- name: str
162
- display_name: str
163
- type: str = "text"
164
- hidden: bool = False
165
- never_hidden: bool = False
166
- displayed_by_default: bool = True
167
-
168
-
169
- def update_column_choices(df):
170
- """Update column choices based on what's actually in the dataframe"""
171
- if df is None or df.empty:
172
- return get_all_column_choices()
173
-
174
- # Get columns that actually exist in the dataframe
175
- existing_columns = list(df.columns)
176
-
177
- # Get all possible columns with their display names
178
- all_columns = get_all_column_choices()
179
-
180
- # Filter to only include columns that exist in the dataframe
181
- valid_columns = [
182
- (col_name, display_name)
183
- for col_name, display_name in all_columns
184
- if col_name in existing_columns
185
- ]
186
-
187
- # Return default if there are no valid columns
188
- if not valid_columns:
189
- return get_all_column_choices()
190
-
191
- return valid_columns
192
-
193
-
194
- # Update the column_selector initialization
195
- def get_initial_columns():
196
- """Get initial columns to show in the dropdown"""
197
- try:
198
- # Get available columns in the main dataframe
199
- available_cols = list(LEADERBOARD_DF.columns)
200
- logger.info(f"Available columns in LEADERBOARD_DF: {available_cols}")
201
-
202
- # If dataframe is empty, use default visible columns
203
- if not available_cols:
204
- return get_default_visible_columns()
205
-
206
- # Get default visible columns that actually exist in the dataframe
207
- valid_defaults = [
208
- col for col in get_default_visible_columns() if col in available_cols
209
- ]
210
-
211
- # If none of the defaults exist, return all available columns
212
- if not valid_defaults:
213
- return available_cols
214
-
215
- return valid_defaults
216
- except Exception as e:
217
- logger.error(f"Error getting initial columns: {e}")
218
- return get_default_visible_columns()
219
-
220
-
221
- def init_leaderboard(dataframe, visible_columns=None):
222
- """
223
- Initialize a standard Gradio Dataframe component for the leaderboard.
224
- """
225
- if dataframe is None or dataframe.empty:
226
- # Create an empty dataframe with the right columns
227
- columns = [getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS]
228
- dataframe = pd.DataFrame(columns=columns)
229
- logger.warning("Initializing empty leaderboard")
230
-
231
- # Lowercase model_name for display
232
- if "model_name" in dataframe.columns:
233
- dataframe = dataframe.copy()
234
- dataframe["model_name"] = dataframe["model_name"].str.lower()
235
-
236
- if "model_type" in dataframe.columns:
237
- dataframe = dataframe.copy()
238
- dataframe["model_type"] = dataframe["model_type"].str.replace(" : ", "-")
239
-
240
- if "review_model_type" in dataframe.columns:
241
- dataframe = dataframe.copy()
242
- dataframe["review_model_type"] = dataframe["review_model_type"].str.replace("custom", "custom")
243
-
244
- # print("\n\n", "dataframe", dataframe, "--------------------------------\n\n")
245
-
246
- # Determine which columns to display
247
- display_column_names = [
248
- getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS
249
- ]
250
- hidden_column_names = [getattr(CODEREVIEW_COLUMN, col).name for col in HIDDEN_COLS]
251
-
252
- # Columns that should always be shown
253
- always_visible = [getattr(CODEREVIEW_COLUMN, col).name for col in NEVER_HIDDEN_COLS]
254
-
255
- # Use provided visible columns if specified, otherwise use default
256
- if visible_columns is None:
257
- # Determine which columns to show initially
258
- visible_columns = [
259
- col for col in display_column_names if col not in hidden_column_names
260
- ]
261
-
262
- # Always include the never-hidden columns
263
- for col in always_visible:
264
- if col not in visible_columns and col in dataframe.columns:
265
- visible_columns.append(col)
266
-
267
- # Make sure we only include columns that actually exist in the dataframe
268
- visible_columns = [col for col in visible_columns if col in dataframe.columns]
269
-
270
- # Map GuardBench column types to Gradio's expected datatype strings
271
- # Valid Gradio datatypes are: 'str', 'number', 'bool', 'date', 'markdown', 'html', 'image'
272
- type_mapping = {
273
- "text": "str",
274
- "number": "number",
275
- "bool": "bool",
276
- "date": "date",
277
- "markdown": "markdown",
278
- "html": "html",
279
- "image": "image",
280
- }
281
-
282
- # Create a list of datatypes in the format Gradio expects
283
- datatypes = []
284
- for col in visible_columns:
285
- # Find the corresponding CODEREVIEW_COLUMN entry
286
- col_type = None
287
- for display_col in DISPLAY_COLS:
288
- if getattr(CODEREVIEW_COLUMN, display_col).name == col:
289
- orig_type = getattr(CODEREVIEW_COLUMN, display_col).type
290
- # Map to Gradio's expected types
291
- col_type = type_mapping.get(orig_type, "str")
292
- break
293
-
294
- # Default to 'str' if type not found or not mappable
295
- if col_type is None:
296
- col_type = "str"
297
-
298
- datatypes.append(col_type)
299
-
300
- # Create a dummy column for search functionality if it doesn't exist
301
- if "search_dummy" not in dataframe.columns:
302
- dataframe["search_dummy"] = dataframe.apply(
303
- lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
304
- axis=1,
305
- )
306
-
307
- # Select only the visible columns for display
308
- visible_columns.remove("model_name")
309
-
310
- visible_columns = ["model_name"] + visible_columns
311
- display_df = dataframe[visible_columns].copy()
312
-
313
- # print(f"--- DataFrame inside init_leaderboard (before rounding) ---")
314
- # print(display_df[['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']].head() if all(c in display_df.columns for c in ['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']) else "Relevant columns not present")
315
- # print(f"-------------------------------------------------------------")
316
-
317
- # Round numeric columns to 3 decimal places for display
318
- numeric_cols = display_df.select_dtypes(include=np.number).columns
319
- for col in numeric_cols:
320
- # Avoid rounding integer columns like counts
321
- if not pd.api.types.is_integer_dtype(display_df[col]):
322
- # Format floats to exactly 3 decimal places, preserving trailing zeros
323
- display_df[col] = display_df[col].apply(
324
- lambda x: f"{x:.3f}" if pd.notna(x) else None
325
- )
326
-
327
- column_info_map = {
328
- f.name: getattr(CODEREVIEW_COLUMN, f.name) for f in fields(CODEREVIEW_COLUMN)
329
- }
330
- column_mapping = {
331
- col: column_info_map.get(col, ColumnInfo(col, col)).display_name
332
- for col in visible_columns
333
- }
334
-
335
- # Rename columns in the DataFrame
336
- display_df.rename(columns=column_mapping, inplace=True)
337
-
338
- # Apply styling - note: styling might need adjustment if it relies on column names
339
- styler = display_df.style.set_properties(**{"text-align": "right"}).set_properties(
340
- subset=["Model"], **{"width": "200px"}
341
  )
342
-
343
- return gr.Dataframe(
344
- value=styler,
345
- datatype=datatypes,
346
- interactive=False,
347
- wrap=True,
348
- height=2500,
349
- elem_id="leaderboard-table",
350
- row_count=len(display_df),
351
  )
 
 
352
 
353
 
354
- def search_filter_leaderboard(
355
- df, search_query="", comment_languages=None, version=CURRENT_VERSION
356
- ):
357
- """
358
- Filter the leaderboard based on search query and comment languages.
359
- """
360
- if df is None or df.empty:
361
- return df
362
-
363
- filtered_df = df.copy()
364
-
365
- # Add search dummy column if it doesn't exist
366
- if "search_dummy" not in filtered_df.columns:
367
- filtered_df["search_dummy"] = filtered_df.apply(
368
- lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
369
- axis=1,
370
- )
371
-
372
- # Apply comment language filter (assuming there's a comment_language column in the data)
373
- if comment_languages and len(comment_languages) > 0:
374
- # Look for a comment language column in the dataframe
375
- comment_lang_cols = [col for col in filtered_df.columns if 'comment_language' in col.lower()]
376
- if comment_lang_cols:
377
- filtered_df = filtered_df[
378
- filtered_df[comment_lang_cols[0]].isin(comment_languages)
379
- ]
380
-
381
- # Apply search query
382
- if search_query:
383
- search_terms = [
384
- term.strip() for term in search_query.split(";") if term.strip()
385
- ]
386
- if search_terms:
387
- combined_mask = None
388
- for term in search_terms:
389
- mask = filtered_df["search_dummy"].str.contains(
390
- term, case=False, na=False
391
- )
392
- if combined_mask is None:
393
- combined_mask = mask
394
- else:
395
- combined_mask = combined_mask | mask
396
-
397
- if combined_mask is not None:
398
- filtered_df = filtered_df[combined_mask]
399
-
400
- # Drop the search dummy column before returning
401
- visible_columns = [col for col in filtered_df.columns if col != "search_dummy"]
402
- return filtered_df[visible_columns]
403
-
404
-
405
- def refresh_data_with_filters(
406
- version=CURRENT_VERSION, search_query="", comment_languages=None, selected_columns=None
407
- ):
408
- """
409
- Refresh the leaderboard data and update all components with filtering.
410
- Ensures we handle cases where dataframes might have limited columns.
411
- """
412
- global LEADERBOARD_DF
413
- try:
414
- logger.info(f"Performing refresh of leaderboard data with filters...")
415
- # Get new data
416
- main_df = get_leaderboard_df(version=version)
417
- LEADERBOARD_DF = main_df
418
- category_dfs = [
419
- get_category_leaderboard_df(category, version=version)
420
- for category in CATEGORIES
421
- ]
422
- selected_columns = [
423
- x.lower()
424
- .replace(" ", "_")
425
- .replace("(", "")
426
- .replace(")", "")
427
- .replace("_recall", "_recall_binary")
428
- .replace("_precision", "_precision_binary")
429
- for x in selected_columns
430
- ]
431
-
432
- # Log the actual columns we have
433
- logger.info(f"Main dataframe columns: {list(main_df.columns)}")
434
-
435
- # Apply filters to each dataframe
436
- filtered_main_df = search_filter_leaderboard(
437
- main_df, search_query, comment_languages, version
438
- )
439
- filtered_category_dfs = [
440
- search_filter_leaderboard(df, search_query, comment_languages, version)
441
- for df in category_dfs
442
- ]
443
-
444
- # Get available columns from the dataframe
445
- available_columns = list(filtered_main_df.columns)
446
-
447
- # Filter selected columns to only those available in the data
448
- if selected_columns:
449
- # Convert display names to internal names first
450
- internal_selected_columns = [
451
- x.lower()
452
- .replace(" ", "_")
453
- .replace("(", "")
454
- .replace(")", "")
455
- .replace("_recall", "_recall_binary")
456
- .replace("_precision", "_precision_binary")
457
- for x in selected_columns
458
- ]
459
- valid_selected_columns = [
460
- col for col in internal_selected_columns if col in available_columns
461
- ]
462
- if not valid_selected_columns and "model_name" in available_columns:
463
- # Fallback if conversion/filtering leads to empty selection
464
- valid_selected_columns = ["model_name"] + [
465
- col
466
- for col in get_default_visible_columns()
467
- if col in available_columns
468
- ]
469
- else:
470
- # If no columns were selected in the dropdown, use default visible columns that exist
471
- valid_selected_columns = [
472
- col for col in get_default_visible_columns() if col in available_columns
473
- ]
474
-
475
- # Initialize dataframes for display with valid selected columns
476
- main_dataframe = init_leaderboard(filtered_main_df, valid_selected_columns)
477
-
478
- # For category dataframes, get columns that actually exist in each one
479
- category_dataframes = []
480
- for df in filtered_category_dfs:
481
- df_columns = list(df.columns)
482
- df_valid_columns = [
483
- col for col in valid_selected_columns if col in df_columns
484
- ]
485
- if not df_valid_columns and "model_name" in df_columns:
486
- df_valid_columns = ["model_name"] + get_default_visible_columns()
487
- category_dataframes.append(init_leaderboard(df, df_valid_columns))
488
-
489
- return main_dataframe, *category_dataframes
490
-
491
- except Exception as e:
492
- logger.error(f"Error in refresh with filters: {e}")
493
- # Return the current leaderboards on error
494
- return leaderboard, *[
495
- tab.children[0] for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
496
- ]
497
-
498
-
499
- def submit_results(
500
- model_name: str,
501
- base_model: str,
502
- revision: str,
503
- precision: str,
504
- weight_type: str,
505
- model_type: str,
506
- mode: str,
507
- submission_file: tempfile._TemporaryFileWrapper,
508
- version: str,
509
- review_model_type: ReviewModelType,
510
- programming_language: str,
511
- comment_language: str,
512
- ):
513
- """
514
- Handle submission of results with model metadata.
515
- """
516
- if submission_file is None:
517
- return styled_error("No submission file provided")
518
-
519
- if not model_name:
520
- return styled_error("Model name is required")
521
-
522
- if not model_type:
523
- return styled_error("Please select a model type")
524
-
525
- if not mode:
526
- return styled_error("Please select an inference mode")
527
-
528
- file_path = submission_file.name
529
- logger.info(f"Received submission for model {model_name}: {file_path}")
530
-
531
- # Add metadata to the submission
532
- metadata = {
533
- "model_name": model_name,
534
- "base_model": base_model,
535
- "revision": revision if revision else "main",
536
- "precision": precision,
537
- "weight_type": weight_type,
538
- "model_type": model_type,
539
- "mode": mode,
540
- "version": version,
541
- "review_model_type": review_model_type,
542
- "programming_language": programming_language,
543
- "comment_language": comment_language,
544
- }
545
-
546
- # Process the submission
547
- result = process_submission(file_path, metadata, version=version)
548
-
549
- # Refresh the leaderboard data
550
- global LEADERBOARD_DF
551
- try:
552
- logger.info(
553
- f"Refreshing leaderboard data after submission for version {version}..."
554
- )
555
- LEADERBOARD_DF = get_leaderboard_df(version=version)
556
- logger.info("Refreshed leaderboard data after submission")
557
- except Exception as e:
558
- logger.error(f"Error refreshing leaderboard data: {e}")
559
-
560
- return result
561
-
562
-
563
- def refresh_data(version=CURRENT_VERSION):
564
- """
565
- Refresh the leaderboard data and update all components.
566
- """
567
- try:
568
- logger.info(f"Performing scheduled refresh of leaderboard data...")
569
- # Get new data
570
- main_df = get_leaderboard_df(version=version)
571
- category_dfs = [
572
- get_category_leaderboard_df(category, version=version)
573
- for category in CATEGORIES
574
- ]
575
-
576
- # For gr.Dataframe, we return the actual dataframes
577
- return main_df, *category_dfs
578
-
579
- except Exception as e:
580
- logger.error(f"Error in scheduled refresh: {e}")
581
- return None, *[None for _ in CATEGORIES]
582
-
583
 
584
- def update_leaderboards(version):
585
- """
586
- Update all leaderboard components with data for the selected version.
587
- """
588
- try:
589
- new_df = get_leaderboard_df(version=version)
590
- category_dfs = [
591
- get_category_leaderboard_df(category, version=version)
592
- for category in CATEGORIES
593
- ]
594
- return new_df, *category_dfs
595
- except Exception as e:
596
- logger.error(f"Error updating leaderboards for version {version}: {e}")
597
- return None, *[None for _ in CATEGORIES]
598
 
599
-
600
- def create_performance_plot(
601
- selected_models, category, metric="f1_binary", version=CURRENT_VERSION
602
- ):
603
- """
604
- Create a radar plot comparing model performance for selected models.
605
- """
606
- if category == "All Results":
607
- df = get_leaderboard_df(version=version)
608
- else:
609
- df = get_category_leaderboard_df(category, version=version)
610
-
611
- if df.empty:
612
- return go.Figure()
613
-
614
- # Lowercase model_name in df and selected_models
615
- df = df.copy()
616
- df["model_name"] = df["model_name"].str.lower()
617
- selected_models = [m.lower() for m in selected_models]
618
- df = df[df["model_name"].isin(selected_models)]
619
- metric_cols = [col for col in df.columns if metric in col]
620
- fig = go.Figure()
621
- colors = ["#8FCCCC", "#C2A4B6", "#98B4A6", "#B68F7C"]
622
- for idx, model in enumerate(selected_models):
623
- model_data = df[df["model_name"] == model]
624
- if not model_data.empty:
625
- values = model_data[metric_cols].values[0].tolist()
626
- values = values + [values[0]]
627
- categories = [col.replace(f"_{metric}", "") for col in metric_cols]
628
- # Replace 'jailbreaked' with 'jailbroken' in categories
629
- categories = [cat.replace('jailbreaked', 'jailbroken') for cat in categories]
630
- categories = categories + [categories[0]]
631
- fig.add_trace(
632
- go.Scatterpolar(
633
- r=values,
634
- theta=categories,
635
- name=model,
636
- line_color=colors[idx % len(colors)],
637
- fill="toself",
638
- )
639
- )
640
- fig.update_layout(
641
- paper_bgcolor="#000000",
642
- plot_bgcolor="#000000",
643
- font={"color": "#ffffff"},
644
- title={
645
- "text": f"{category} - {metric.upper()} Score Comparison",
646
- "font": {"color": "#ffffff", "size": 24},
647
- },
648
- polar=dict(
649
- bgcolor="#000000",
650
- radialaxis=dict(
651
- visible=True,
652
- range=[0, 1],
653
- gridcolor="#333333",
654
- linecolor="#333333",
655
- tickfont={"color": "#ffffff"},
656
  ),
657
- angularaxis=dict(
658
- gridcolor="#333333",
659
- linecolor="#333333",
660
- tickfont={"color": "#ffffff"},
661
  ),
662
- ),
663
- height=600,
664
- showlegend=True,
665
- legend=dict(
666
- yanchor="top",
667
- y=0.99,
668
- xanchor="right",
669
- x=0.99,
670
- bgcolor="rgba(0,0,0,0.5)",
671
- font={"color": "#ffffff"},
672
- ),
673
- )
674
- return fig
675
-
676
-
677
- def update_model_choices(version):
678
- """
679
- Update the list of available models for the given version.
680
- """
681
- df = get_leaderboard_df(version=version)
682
- if df.empty:
683
- return []
684
- return sorted(df["model_name"].str.lower().unique().tolist())
685
-
686
-
687
- def update_visualization(selected_models, selected_category, selected_metric, version):
688
- """
689
- Update the visualization based on user selections.
690
- """
691
- if not selected_models:
692
- return go.Figure()
693
- return create_performance_plot(
694
- selected_models, selected_category, selected_metric, version
695
  )
696
 
697
 
698
- # Create Gradio app
699
- demo = gr.Blocks(css=custom_css, theme=custom_theme)
700
-
701
- CATEGORY_DISPLAY_MAP = {
702
- "Python": "Python",
703
- "Java": "Java",
704
- "Scala": "Scala",
705
- "Go": "Go"
706
- }
707
- # Create reverse mapping for lookups
708
- CATEGORY_REVERSE_MAP = {v: k for k, v in CATEGORY_DISPLAY_MAP.items()}
709
-
710
  with demo:
711
  gr.HTML(TITLE)
712
- # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
713
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
714
 
715
- with gr.Row():
716
- tabs = gr.Tabs(elem_classes="tab-buttons")
 
717
 
718
- with tabs:
719
- with gr.TabItem("Leaderboard", elem_id="codereview-leaderboard-tab", id=0):
720
- with gr.Row():
721
- version_selector = gr.Dropdown(
722
- choices=BENCHMARK_VERSIONS,
723
- label="Benchmark Version",
724
- value=CURRENT_VERSION,
725
- interactive=True,
726
- elem_classes="version-selector",
727
- scale=1,
728
- visible=False,
729
- )
730
 
 
 
731
  with gr.Row():
732
- search_input = gr.Textbox(
733
- placeholder="Search by models (use ; to split)",
734
- label="Search",
735
- elem_id="search-bar",
736
- scale=2,
737
- )
738
- comment_language_filter = gr.Dropdown(
739
- choices=["en", "ru"],
740
- label="Comment Language",
741
- multiselect=True,
742
- value=[],
743
- interactive=True,
744
- scale=1,
745
- )
746
- programming_language_filter = gr.Dropdown(
747
- choices=["Python", "Java", "Scala", "Go"],
748
- label="Programming Language",
749
- multiselect=True,
750
- value=[],
751
- interactive=True,
752
- scale=1,
753
- )
754
- with gr.Row():
755
- topic_filter = gr.Dropdown(
756
- choices=TOPICS,
757
- label="Topic",
758
- multiselect=True,
759
- value=[],
760
- interactive=True,
761
- scale=2,
762
- )
763
- column_selector = gr.Dropdown(
764
- choices=get_all_column_choices(),
765
- label="Columns",
766
- multiselect=True,
767
- value=get_initial_columns(),
768
- interactive=True,
769
- visible=False,
770
- scale=1,
771
- )
772
- with gr.Row():
773
- refresh_button = gr.Button(
774
- "Refresh", scale=0, elem_id="refresh-button"
775
- )
776
-
777
- # Create tabs for each category
778
- with gr.Tabs(elem_classes="category-tabs") as category_tabs:
779
- # First tab for average metrics across all categories
780
- with gr.TabItem("All Results", elem_id="overall-tab"):
781
- leaderboard = init_leaderboard(LEADERBOARD_DF)
782
-
783
- # Create a tab for each category using display names
784
- for category in CATEGORIES:
785
- display_name = CATEGORY_DISPLAY_MAP.get(category, category)
786
- elem_id = f"category-{display_name.lower().replace(' ', '-').replace('&', 'and')}-tab"
787
- with gr.TabItem(display_name, elem_id=elem_id):
788
- category_df = get_category_leaderboard_df(
789
- category, version=CURRENT_VERSION
790
- )
791
- category_leaderboard = init_leaderboard(category_df)
792
-
793
- # Connect search and filter inputs to update function
794
- def update_with_search_filters(
795
- version=CURRENT_VERSION,
796
- search_query="",
797
- comment_languages=None,
798
- selected_columns=None,
799
- ):
800
- """
801
- Update the leaderboards with search and filter settings.
802
- """
803
- return refresh_data_with_filters(
804
- version, search_query, comment_languages, selected_columns
805
- )
806
-
807
- # Refresh button functionality
808
- def refresh_and_update(
809
- version, search_query, comment_languages, selected_columns
810
- ):
811
- """
812
- Refresh data, update LEADERBOARD_DF, and return updated components.
813
- """
814
- global LEADERBOARD_DF
815
- main_df = get_leaderboard_df(version=version)
816
- LEADERBOARD_DF = main_df # Update the global DataFrame
817
- return refresh_data_with_filters(
818
- version, search_query, comment_languages, selected_columns
819
- )
820
-
821
- refresh_button.click(
822
- fn=refresh_and_update,
823
- inputs=[
824
- version_selector,
825
- search_input,
826
- comment_language_filter,
827
- column_selector,
828
- ],
829
- outputs=[leaderboard]
830
- + [
831
- category_tabs.children[i].children[0]
832
- for i in range(1, len(CATEGORIES) + 1)
833
- ],
834
- )
835
- # Search input functionality
836
- search_input.change(
837
- fn=refresh_data_with_filters,
838
- inputs=[
839
- version_selector,
840
- search_input,
841
- comment_language_filter,
842
- column_selector,
843
- ],
844
- outputs=[leaderboard]
845
- + [
846
- category_tabs.children[i].children[0]
847
- for i in range(1, len(CATEGORIES) + 1)
848
- ],
849
- )
850
-
851
- # Comment language filter functionality
852
- comment_language_filter.change(
853
- fn=refresh_data_with_filters,
854
- inputs=[
855
- version_selector,
856
- search_input,
857
- comment_language_filter,
858
- column_selector,
859
- ],
860
- outputs=[leaderboard]
861
- + [
862
- category_tabs.children[i].children[0]
863
- for i in range(1, len(CATEGORIES) + 1)
864
- ],
865
- )
866
-
867
- # Version selector functionality
868
- version_selector.change(
869
- fn=refresh_data_with_filters,
870
- inputs=[
871
- version_selector,
872
- search_input,
873
- comment_language_filter,
874
- column_selector,
875
- ],
876
- outputs=[leaderboard]
877
- + [
878
- category_tabs.children[i].children[0]
879
- for i in range(1, len(CATEGORIES) + 1)
880
- ],
881
- )
882
-
883
- # Update the update_columns function to handle updating all tabs at once
884
- def update_columns(selected_columns):
885
- """
886
- Update all leaderboards to show the selected columns.
887
- Ensures all selected columns are preserved in the update.
888
-
889
- """
890
-
891
- try:
892
- logger.info(f"Updating columns to show: {selected_columns}")
893
-
894
- # If no columns are selected, use default visible columns
895
- if not selected_columns or len(selected_columns) == 0:
896
- selected_columns = get_default_visible_columns()
897
- logger.info(
898
- f"No columns selected, using defaults: {selected_columns}"
899
  )
900
-
901
- # Convert display names to internal names
902
- internal_selected_columns = [
903
- x.lower()
904
- .replace(" ", "_")
905
- .replace("(", "")
906
- .replace(")", "")
907
- .replace("_recall", "_recall_binary")
908
- .replace("_precision", "_precision_binary")
909
- for x in selected_columns
910
- ]
911
-
912
- # Get the current data with ALL columns preserved
913
- main_df = get_leaderboard_df(version=version_selector.value)
914
-
915
- # Get category dataframes with ALL columns preserved
916
- category_dfs = [
917
- get_category_leaderboard_df(
918
- category, version=version_selector.value
919
  )
920
- for category in CATEGORIES
921
- ]
922
 
923
- # Log columns for debugging
924
- logger.info(f"Main dataframe columns: {list(main_df.columns)}")
925
- logger.info(
926
- f"Selected columns (internal): {internal_selected_columns}"
927
- )
928
-
929
- # IMPORTANT: Make sure model_name is always included
930
- if (
931
- "model_name" in main_df.columns
932
- and "model_name" not in internal_selected_columns
933
- ):
934
- internal_selected_columns = [
935
- "model_name"
936
- ] + internal_selected_columns
937
-
938
- # Initialize the main leaderboard with the selected columns
939
- # We're passing the internal_selected_columns directly to preserve the selection
940
- main_leaderboard = init_leaderboard(
941
- main_df, internal_selected_columns
942
- )
943
-
944
- # Initialize category dataframes with the same selected columns
945
- # This ensures consistency across all tabs
946
- category_leaderboards = []
947
- for df in category_dfs:
948
- # Use the same selected columns for each category
949
- # init_leaderboard will automatically handle filtering to columns that exist
950
- category_leaderboards.append(
951
- init_leaderboard(df, internal_selected_columns)
952
  )
953
-
954
- return main_leaderboard, *category_leaderboards
955
-
956
- except Exception as e:
957
- logger.error(f"Error updating columns: {e}")
958
- import traceback
959
-
960
- logger.error(traceback.format_exc())
961
- return leaderboard, *[
962
- tab.children[0]
963
- for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
964
- ]
965
-
966
- # Connect column selector to update function
967
- column_selector.change(
968
- fn=update_columns,
969
- inputs=[column_selector],
970
- outputs=[leaderboard]
971
- + [
972
- category_tabs.children[i].children[0]
973
- for i in range(1, len(CATEGORIES) + 1)
974
- ],
975
- )
976
-
977
- # with gr.TabItem("About", elem_id="codereview-about-tab", id=2):
978
- # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
979
-
980
- with gr.TabItem("Submit", elem_id="codereview-submit-tab", id=1):
981
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
982
-
983
- with gr.Row():
984
- # with gr.Column(scale=3):
985
- # gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
986
- with gr.Column(scale=1):
987
- # Add version selector specifically for the submission tab
988
- submission_version_selector = gr.Dropdown(
989
- choices=BENCHMARK_VERSIONS,
990
- label="Benchmark Version",
991
- value=CURRENT_VERSION,
992
- interactive=True,
993
- elem_classes="version-selector",
994
- visible=False,
995
- )
996
-
997
- with gr.Row():
998
- with gr.Column():
999
- model_name_textbox = gr.Textbox(label="Model name")
1000
- mode_selector = gr.Dropdown(
1001
- choices=[m.name for m in Mode],
1002
- label="Mode",
1003
- multiselect=False,
1004
- value=None,
1005
- interactive=True,
1006
- )
1007
- revision_name_textbox = gr.Textbox(
1008
- label="Revision commit", placeholder="main"
1009
- )
1010
- model_type = gr.Dropdown(
1011
- choices=[
1012
- t.to_str("-")
1013
- for t in ModelType
1014
- if t != ModelType.Unknown and t != ModelType.ClosedSource
1015
- ],
1016
- label="Model type",
1017
- multiselect=False,
1018
- value=None,
1019
- interactive=True,
1020
- )
1021
- review_model_type = gr.Dropdown(
1022
- choices=[t.name for t in ReviewModelType],
1023
- label="Review model type",
1024
- multiselect=False,
1025
- value=ReviewModelType.CUSTOM.name,
1026
- interactive=True,
1027
- )
1028
- programming_language_selector = gr.Dropdown(
1029
- choices=["Python", "Java", "Scala", "Go"],
1030
- label="Programming Language",
1031
- multiselect=False,
1032
- value=None,
1033
- interactive=True,
1034
- )
1035
- comment_language_selector = gr.Dropdown(
1036
- choices=["en", "ru"],
1037
- label="Comment Language",
1038
- multiselect=False,
1039
- value="en",
1040
- interactive=True,
1041
- )
1042
-
1043
- with gr.Column():
1044
- precision = gr.Dropdown(
1045
- choices=[
1046
- i.name for i in Precision if i != Precision.Unknown
1047
- ],
1048
- label="Precision",
1049
- multiselect=False,
1050
- value="float16",
1051
- interactive=True,
1052
- )
1053
- weight_type = gr.Dropdown(
1054
- choices=[i.name for i in WeightType],
1055
- label="Weights type",
1056
- multiselect=False,
1057
- value="Original",
1058
- interactive=True,
1059
- )
1060
- base_model_name_textbox = gr.Textbox(
1061
- label="Base model (for delta or adapter weights)"
1062
- )
1063
-
1064
- with gr.Row():
1065
- file_input = gr.File(
1066
- label="Upload JSONL Results File", file_types=[".jsonl"]
1067
  )
1068
 
1069
- submit_button = gr.Button("Submit Results")
1070
- result_output = gr.Markdown()
1071
-
1072
- submit_button.click(
1073
- fn=submit_results,
1074
- inputs=[
1075
- model_name_textbox,
1076
- base_model_name_textbox,
1077
- revision_name_textbox,
1078
- precision,
1079
- weight_type,
1080
- model_type,
1081
- mode_selector,
1082
- file_input,
1083
- submission_version_selector,
1084
- review_model_type,
1085
- programming_language_selector,
1086
- comment_language_selector,
1087
- ],
1088
- outputs=result_output,
1089
- )
1090
-
1091
- # Version selector functionality
1092
- version_selector.change(
1093
- fn=update_leaderboards,
1094
- inputs=[version_selector],
1095
- outputs=[leaderboard]
1096
- + [
1097
- category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
1098
- ],
1099
- ).then(
1100
- lambda version: refresh_data_with_filters(version),
1101
- inputs=[version_selector],
1102
- outputs=[leaderboard]
1103
- + [
1104
- category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
1105
- ],
1106
- )
1107
 
 
 
 
 
 
 
 
 
 
1108
 
1109
- # Set up the scheduler to refresh data periodically
1110
  scheduler = BackgroundScheduler()
1111
- scheduler.add_job(refresh_data, "interval", minutes=30)
1112
  scheduler.start()
1113
-
1114
- # Launch the app
1115
- if __name__ == "__main__":
1116
- demo.launch()
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
 
 
4
  from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
 
 
6
 
7
  from src.about import (
8
  CITATION_BUTTON_LABEL,
 
14
  )
15
  from src.display.css_html_js import custom_css
16
  from src.display.utils import (
17
+ BENCHMARK_COLS,
18
+ COLS,
19
+ EVAL_COLS,
20
+ EVAL_TYPES,
21
+ AutoEvalColumn,
 
 
 
 
22
  ModelType,
23
+ fields,
 
24
  WeightType,
25
+ Precision
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  )
27
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
+ from src.submission.submit import add_new_eval
30
 
 
 
31
 
32
+ def restart_space():
33
+ API.restart_space(repo_id=REPO_ID)
 
34
 
35
+ ### Space initialisation
36
  try:
37
+ print(EVAL_REQUESTS_PATH)
38
+ snapshot_download(
39
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
 
 
 
 
40
  )
41
+ except Exception:
42
+ restart_space()
43
+ try:
44
+ print(EVAL_RESULTS_PATH)
45
+ snapshot_download(
46
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
 
 
 
47
  )
48
+ except Exception:
49
+ restart_space()
50
 
51
 
52
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
 
 
 
 
53
 
54
+ (
55
+ finished_eval_queue_df,
56
+ running_eval_queue_df,
57
+ pending_eval_queue_df,
58
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
 
 
 
 
 
 
 
59
 
60
+ def init_leaderboard(dataframe):
61
+ if dataframe is None or dataframe.empty:
62
+ raise ValueError("Leaderboard DataFrame is empty or None.")
63
+ return Leaderboard(
64
+ value=dataframe,
65
+ datatype=[c.type for c in fields(AutoEvalColumn)],
66
+ select_columns=SelectColumns(
67
+ default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
+ label="Select Columns to Display:",
70
+ ),
71
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
+ hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
+ filter_columns=[
74
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
+ ColumnFilter(
77
+ AutoEvalColumn.params.name,
78
+ type="slider",
79
+ min=0.01,
80
+ max=150,
81
+ label="Select the number of parameters (B)",
 
 
 
 
82
  ),
83
+ ColumnFilter(
84
+ AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
 
 
85
  ),
86
+ ],
87
+ bool_checkboxgroup_label="Hide models",
88
+ interactive=False,
 
 
 
 
 
89
  )
90
 
91
 
92
+ demo = gr.Blocks(css=custom_css)
 
 
 
 
 
 
 
 
 
 
 
93
  with demo:
94
  gr.HTML(TITLE)
95
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
96
 
97
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
+ with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
100
 
101
+ with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
 
 
 
 
 
 
 
 
 
103
 
104
+ with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
+ with gr.Column():
106
  with gr.Row():
107
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
+
109
+ with gr.Column():
110
+ with gr.Accordion(
111
+ f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
+ open=False,
113
+ ):
114
+ with gr.Row():
115
+ finished_eval_table = gr.components.Dataframe(
116
+ value=finished_eval_queue_df,
117
+ headers=EVAL_COLS,
118
+ datatype=EVAL_TYPES,
119
+ row_count=5,
 
 
 
 
 
120
  )
121
+ with gr.Accordion(
122
+ f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
+ open=False,
124
+ ):
125
+ with gr.Row():
126
+ running_eval_table = gr.components.Dataframe(
127
+ value=running_eval_queue_df,
128
+ headers=EVAL_COLS,
129
+ datatype=EVAL_TYPES,
130
+ row_count=5,
 
 
 
 
 
 
 
 
 
131
  )
 
 
132
 
133
+ with gr.Accordion(
134
+ f" Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
+ open=False,
136
+ ):
137
+ with gr.Row():
138
+ pending_eval_table = gr.components.Dataframe(
139
+ value=pending_eval_queue_df,
140
+ headers=EVAL_COLS,
141
+ datatype=EVAL_TYPES,
142
+ row_count=5,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  )
144
+ with gr.Row():
145
+ gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
+
147
+ with gr.Row():
148
+ with gr.Column():
149
+ model_name_textbox = gr.Textbox(label="Model name")
150
+ revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
+ model_type = gr.Dropdown(
152
+ choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
+ label="Model type",
154
+ multiselect=False,
155
+ value=None,
156
+ interactive=True,
 
 
 
 
 
157
  )
158
 
159
+ with gr.Column():
160
+ precision = gr.Dropdown(
161
+ choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
+ label="Precision",
163
+ multiselect=False,
164
+ value="float16",
165
+ interactive=True,
166
+ )
167
+ weight_type = gr.Dropdown(
168
+ choices=[i.value.name for i in WeightType],
169
+ label="Weights type",
170
+ multiselect=False,
171
+ value="Original",
172
+ interactive=True,
173
+ )
174
+ base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
+
176
+ submit_button = gr.Button("Submit Eval")
177
+ submission_result = gr.Markdown()
178
+ submit_button.click(
179
+ add_new_eval,
180
+ [
181
+ model_name_textbox,
182
+ base_model_name_textbox,
183
+ revision_name_textbox,
184
+ precision,
185
+ weight_type,
186
+ model_type,
187
+ ],
188
+ submission_result,
189
+ )
 
 
 
 
 
 
 
190
 
191
+ with gr.Row():
192
+ with gr.Accordion("📙 Citation", open=False):
193
+ citation_button = gr.Textbox(
194
+ value=CITATION_BUTTON_TEXT,
195
+ label=CITATION_BUTTON_LABEL,
196
+ lines=20,
197
+ elem_id="citation-button",
198
+ show_copy_button=True,
199
+ )
200
 
 
201
  scheduler = BackgroundScheduler()
202
+ scheduler.add_job(restart_space, "interval", seconds=1800)
203
  scheduler.start()
204
+ demo.queue(default_concurrency_limit=40).launch()
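
For context on the submission flow wired up above: `add_new_eval` writes a request record that the queue accordions later read back through `get_evaluation_queue_df` with `EVAL_COLS`. A hedged sketch of one such record, using only the field names declared by `EvalQueueColumn` further down in this diff (the exact on-disk layout of the requests dataset, and the `status` values, are assumptions, not shown here):

```python
# Hypothetical pending-request record; field names follow EvalQueueColumn,
# all values and the "PENDING" status string are illustrative only.
pending_request = {
    "model": "org/model",
    "revision": "main",
    "private": False,
    "precision": "float16",
    "weight_type": "Original",
    "status": "PENDING",
}
```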
 
 
 
data/.gitkeep DELETED
@@ -1 +0,0 @@
1
- # Keep this directory in git
 
 
data/leaderboard_data.json DELETED
@@ -1,30 +0,0 @@
1
- {
2
- "leaderboard": [
3
- {
4
- "model_name": "example/model",
5
- "programming_language": "Python",
6
- "comment_language": "English",
7
- "taxonomy_category": "Bug Detection",
8
- "bleu": 0.5,
9
- "llm_pass_1": 0.5,
10
- "llm_pass_5": 0.5,
11
- "llm_pass_10": 0.5,
12
- "metrics": {
13
- "readability": 5,
14
- "relevance": 5,
15
- "explanation_clarity": 5,
16
- "problem_identification": 5,
17
- "actionability": 5,
18
- "completeness": 5,
19
- "specificity": 5,
20
- "contextual_adequacy": 5,
21
- "consistency": 5,
22
- "brevity": 5
23
- },
24
- "submission_ip": "127.0.0.1",
25
- "submission_date": "2024-01-01T00:00:00Z"
26
- }
27
- ],
28
- "last_updated": "2025-07-03T13:10:47.434623+00:00",
29
- "total_entries": 1
30
- }
 
 
 
 
 
data/submissions.json DELETED
@@ -1,5 +0,0 @@
1
- {
2
- "submissions": [],
3
- "last_updated": "2025-07-03T13:10:47.435548+00:00",
4
- "total_submissions": 0
5
- }
 
 
 
 
 
 
example_submission.jsonl DELETED
@@ -1,4 +0,0 @@
1
- {"model_name": "GPT-4-CodeReview", "programming_language": "Python", "comment_language": "en", "topic": "Code Reliability", "observation_id": "obs_001", "code_snippet": "def calculate_sum(a, b):\n return a + b", "review_text": "This function is simple and correct, but consider adding type hints and docstring for better documentation.", "readability": 8.5, "relevance": 9.0, "explanation_clarity": 7.8, "problem_identification": 8.2, "actionability": 8.7, "completeness": 8.0, "specificity": 7.5, "contextual_adequacy": 8.3, "consistency": 8.8, "brevity": 7.2, "pass_at_1": 0.75, "pass_at_5": 0.88, "pass_at_10": 0.92, "bleu_at_10": 0.65, "total_evaluations": 100}
2
- {"model_name": "GPT-4-CodeReview", "programming_language": "Java", "comment_language": "en", "topic": "Coding Standards", "observation_id": "obs_002", "code_snippet": "public class Calculator {\n public int add(int a, int b) {\n return a + b;\n }\n}", "review_text": "Consider following Java naming conventions and adding JavaDoc comments. The method is functionally correct.", "readability": 8.2, "relevance": 8.8, "explanation_clarity": 7.5, "problem_identification": 8.0, "actionability": 8.5, "completeness": 7.8, "specificity": 7.2, "contextual_adequacy": 8.1, "consistency": 8.6, "brevity": 7.0, "pass_at_1": 0.72, "pass_at_5": 0.85, "pass_at_10": 0.90, "bleu_at_10": 0.62, "total_evaluations": 100}
3
- {"model_name": "Claude-3-CodeReview", "programming_language": "Scala", "comment_language": "ru", "topic": "Performance Issues", "observation_id": "obs_003", "code_snippet": "def fibonacci(n: Int): Int = {\n if (n <= 1) n\n else fibonacci(n-1) + fibonacci(n-2)\n}", "review_text": "Эта реализация неэффективна из-за экспоненциальной сложности. Рекомендуется использовать мемоизацию или итеративный подход.", "readability": 8.8, "relevance": 8.5, "explanation_clarity": 8.2, "problem_identification": 9.2, "actionability": 8.3, "completeness": 8.5, "specificity": 8.0, "contextual_adequacy": 8.6, "consistency": 8.2, "brevity": 8.8, "pass_at_1": 0.78, "pass_at_5": 0.89, "pass_at_10": 0.93, "bleu_at_10": 0.68, "total_evaluations": 100}
4
- {"model_name": "Llama-CodeReview", "programming_language": "Go", "comment_language": "en", "topic": "Variables", "observation_id": "obs_004", "code_snippet": "package main\n\nimport \"fmt\"\n\nfunc main() {\n var x int = 5\n var y int = 10\n fmt.Println(x + y)\n}", "review_text": "Consider using short variable declarations (:=) for local variables. Also, the variable names could be more descriptive.", "readability": 7.5, "relevance": 7.8, "explanation_clarity": 7.0, "problem_identification": 7.5, "actionability": 7.2, "completeness": 7.8, "specificity": 6.8, "contextual_adequacy": 7.3, "consistency": 7.6, "brevity": 6.5, "pass_at_1": 0.65, "pass_at_5": 0.78, "pass_at_10": 0.85, "bleu_at_10": 0.55, "total_evaluations": 100}
 
 
 
 
 
gradio_test.ipynb DELETED
@@ -1,32 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": null,
6
- "metadata": {},
7
- "outputs": [],
8
- "source": []
9
- }
10
- ],
11
- "metadata": {
12
- "kernelspec": {
13
- "display_name": "agent_env",
14
- "language": "python",
15
- "name": "python3"
16
- },
17
- "language_info": {
18
- "codemirror_mode": {
19
- "name": "ipython",
20
- "version": 3
21
- },
22
- "file_extension": ".py",
23
- "mimetype": "text/x-python",
24
- "name": "python",
25
- "nbconvert_exporter": "python",
26
- "pygments_lexer": "ipython3",
27
- "version": "3.13.2"
28
- }
29
- },
30
- "nbformat": 4,
31
- "nbformat_minor": 2
32
- }
 
 
 
 
 
leaderboard_data.json DELETED
@@ -1,32 +0,0 @@
1
- {
2
- "entries": [
3
- {
4
- "model_name": "GPT-4-CodeReview",
5
- "model_type": "LLM",
6
- "mode": "Strict",
7
- "review_model_type": "gpt-4",
8
- "programming_language": "Python",
9
- "comment_language": "en",
10
- "topic": "Code Reliability",
11
- "submission_date": "2024-10-06T12:00:00Z",
12
- "version": "v0",
13
- "readability": 8.5,
14
- "relevance": 9.0,
15
- "explanation_clarity": 7.8,
16
- "problem_identification": 8.2,
17
- "actionability": 8.7,
18
- "completeness": 8.0,
19
- "specificity": 7.5,
20
- "contextual_adequacy": 8.3,
21
- "consistency": 8.8,
22
- "brevity": 7.2,
23
- "pass_at_1": 0.75,
24
- "pass_at_5": 0.88,
25
- "pass_at_10": 0.92,
26
- "bleu_at_10": 0.65,
27
- "total_evaluations": 100
28
- }
29
- ],
30
- "last_updated": "2024-10-06T12:00:00Z",
31
- "version": "v0"
32
- }
 
 
 
 
 
requirements.txt CHANGED
@@ -1,8 +1,16 @@
1
- gradio==4.44.1
2
- pandas>=2.0.0
3
- huggingface_hub>=0.20.0
4
- datasets>=2.0.0
5
- apscheduler>=3.10.0
6
- python-dotenv>=1.0.0
7
- plotly>=5.18.0
8
- pydantic==2.10.6
 
 
 
 
 
 
 
 
 
1
+ APScheduler
2
+ black
3
+ datasets
4
+ gradio
5
+ gradio[oauth]
6
+ gradio_leaderboard==0.0.13
7
+ gradio_client
8
+ huggingface-hub>=0.18.0
9
+ matplotlib
10
+ numpy
11
+ pandas
12
+ python-dateutil
13
+ tqdm
14
+ transformers
15
+ tokenizers>=0.15.0
16
+ sentencepiece
src/__init__.py DELETED
@@ -1 +0,0 @@
1
- # CodeReview Leaderboard - Source Module
 
 
src/about.py CHANGED
@@ -1,59 +1,72 @@
1
- """
2
- Text content for the CodeReview Bench Leaderboard.
3
- """
4
 
5
- TITLE = """
6
- <div style="text-align: center; margin-bottom: 1rem">
7
- <h1>CodeReview Bench Leaderboard</h1>
8
- </div>
9
- """
10
 
11
- INTRODUCTION_TEXT = """
12
- ## Introduction
13
 
14
- CodeReview Bench is a comprehensive benchmark for evaluating the quality and effectiveness of automated code review systems.
15
- This leaderboard tracks model performance across various programming languages and review criteria,
16
- including readability, relevance, explanation clarity, and actionability.
 
 
 
 
 
 
17
 
18
- Models are evaluated on their ability to provide high-quality code reviews that are helpful,
19
- accurate, and actionable across multiple programming languages and review categories.
20
- """
21
 
22
- LLM_BENCHMARKS_TEXT = """
23
- CodeReview Bench is a comprehensive benchmark for evaluating automated code review systems across programming languages and review quality dimensions.
24
 
25
- It evaluates models on their ability to provide high-quality code reviews using both LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity) and exact-match metrics (pass@1, pass@5, pass@10) presented in our paper.
 
26
 
27
- The benchmark supports both Russian and English comment languages across 4 programming languages including Python, Java, Go, Scala
 
 
 
 
 
 
 
 
 
 
28
 
29
  """
30
 
31
  EVALUATION_QUEUE_TEXT = """
32
- ## Submit Your Model
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
- To add your model to the CodeReview Bench leaderboard:
 
35
 
36
- 1. Run your evaluation using the CodeReview Bench framework
37
- 2. Upload your results in .jsonl format using this form.
38
- 3. Once validated, your model will appear on the leaderboard.
39
 
40
- ### Requirements:
41
- - Results must include all required metrics: LLM-based multimetric scores and exact-match metrics
42
- - Submissions should cover multiple programming languages where applicable
43
- - Both Russian and English comment languages are supported
44
 
45
- ### ✉️✨ Ready? Upload your results below!
 
 
 
46
  """
47
 
48
- CITATION_BUTTON_LABEL = "Cite CodeReview Bench"
49
-
50
- CITATION_BUTTON_TEXT = """
51
- @misc{codereviewbench2025,
52
- author = {CodeReview Bench Team},
53
- title = {CodeReview Bench: Comprehensive Benchmark for Automated Code Review Systems},
54
- year = {2025},
55
- publisher = {GitHub},
56
- journal = {GitHub repository},
57
- howpublished = {\\url{https://github.com/your-org/codereview-bench}}
58
- }
59
  """
 
1
+ from dataclasses import dataclass
2
+ from enum import Enum
 
3
 
4
+ @dataclass
5
+ class Task:
6
+ benchmark: str
7
+ metric: str
8
+ col_name: str
9
 
 
 
10
 
11
+ # Select your tasks here
12
+ # ---------------------------------------------------
13
+ class Tasks(Enum):
14
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
+ task0 = Task("anli_r1", "acc", "ANLI")
16
+ task1 = Task("logiqa", "acc_norm", "LogiQA")
17
+
18
+ NUM_FEWSHOT = 0 # Change with your few shot
19
+ # ---------------------------------------------------
20
 
 
 
 
21
 
 
 
22
 
23
+ # Your leaderboard name
24
+ TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
+ # What does your leaderboard evaluate?
27
+ INTRODUCTION_TEXT = """
28
+ Intro text
29
+ """
30
+
31
+ # Which evaluations are you running? how can people reproduce what you have?
32
+ LLM_BENCHMARKS_TEXT = f"""
33
+ ## How it works
34
+
35
+ ## Reproducibility
36
+ To reproduce our results, here is the commands you can run:
37
 
38
  """
39
 
40
  EVALUATION_QUEUE_TEXT = """
41
+ ## Some good practices before submitting a model
42
+
43
+ ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
+ ```python
45
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
46
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
47
+ model = AutoModel.from_pretrained("your model name", revision=revision)
48
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
+ ```
50
+ If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
+
52
+ Note: make sure your model is public!
53
+ Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
54
 
55
+ ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
+ It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
57
 
58
+ ### 3) Make sure your model has an open license!
59
+ This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
 
60
 
61
+ ### 4) Fill up your model card
62
+ When we add extra information about models to the leaderboard, it will be automatically taken from the model card
 
 
63
 
64
+ ## In case of model failure
65
+ If your model is displayed in the `FAILED` category, its execution stopped.
66
+ Make sure you have followed the above steps first.
67
+ If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
  """
69
 
70
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
+ CITATION_BUTTON_TEXT = r"""
 
 
 
 
 
 
 
 
 
72
  """
src/display/__init__.py DELETED
@@ -1 +0,0 @@
1
- # Display utilities module
 
 
src/display/css_html_js.py CHANGED
@@ -1,97 +1,105 @@
1
- """
2
- CSS and styling for the CodeReview Bench Leaderboard.
3
- """
4
-
5
  custom_css = """
 
6
  .markdown-text {
7
  font-size: 16px !important;
8
- text-align: justify !important;
9
- line-height: 1.0 !important;
10
- margin-top: 10px !important;
11
- margin-bottom: 10px !important;
12
  }
13
 
14
- .tab-buttons button.selected {
15
- border-color: #f4f4f5 !important;
16
- background: #3f3f46 !important;
17
- color: #f4f4f5 !important;
18
  }
19
 
20
- #citation-button textarea {
21
- font-family: monospace !important;
22
  }
23
 
24
- .leaderboard-container {
25
- margin-top: 20px;
26
  }
27
 
28
- .category-header {
29
- font-weight: bold;
30
- background-color: #f5f5f5;
31
- padding: 10px;
32
- margin-top: 15px;
33
- border-radius: 5px;
34
  }
35
 
36
- .metric-name {
37
- font-weight: bold;
38
- color: #a1a1aa !important;
39
  }
40
 
41
- .model-name {
42
- font-weight: bold;
43
  }
44
 
45
- .model-link:hover {
46
- text-decoration: underline;
47
- color: #ffffff !important;
48
  }
49
-
50
- .version-selector {
51
- margin: 0 !important;
52
- padding: 5px;
53
- border-radius: 5px;
54
  }
55
 
56
- .version-selector label {
57
- font-weight: bold;
58
- color: #f4f4f5 !important;
 
 
 
59
  }
60
 
61
- .version-selector select {
62
- border-color: #3f3f46 !important;
63
- border-radius: 5px;
64
  }
65
 
66
- /* Make sure the version selector is properly aligned with refresh button */
67
- .version-selector > .block {
68
- padding: 0 !important;
 
 
 
 
69
  }
70
 
71
- .version-selector > .block > .wrap {
72
- position: relative;
73
- top: -5px;
74
  }
75
-
76
- /* Force background/border for common layout containers */
77
- .gradio-row > .block,
78
- .gradio-column > .block,
79
- .form,
80
- .panel {
81
- /* background: #18181b !important; */ /* Removed background override */
82
- border-color: #27272a80 !important; /* Made border color semi-transparent */
83
- border-width: 1px !important; /* Ensure border is visible */
84
- border-style: solid !important;
85
  }
86
-
87
- /* Target the specific file upload component area */
88
- .gradio-file .wrap {
89
- /* background: #18181b !important; */ /* Removed background override */
90
- border-color: #27272a !important;
91
  }
92
-
93
- #refresh-button {
94
- margin-top: 5px !important;
95
- margin-bottom: 5px !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  }
97
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  custom_css = """
2
+
3
  .markdown-text {
4
  font-size: 16px !important;
 
 
 
 
5
  }
6
 
7
+ #models-to-add-text {
8
+ font-size: 18px !important;
 
 
9
  }
10
 
11
+ #citation-button span {
12
+ font-size: 16px !important;
13
  }
14
 
15
+ #citation-button textarea {
16
+ font-size: 16px !important;
17
  }
18
 
19
+ #citation-button > label > button {
20
+ margin: 6px;
21
+ transform: scale(1.3);
 
 
 
22
  }
23
 
24
+ #leaderboard-table {
25
+ margin-top: 15px
 
26
  }
27
 
28
+ #leaderboard-table-lite {
29
+ margin-top: 15px
30
  }
31
 
32
+ #search-bar-table-box > div:first-child {
33
+ background: none;
34
+ border: none;
35
  }
36
+
37
+ #search-bar {
38
+ padding: 0px;
 
 
39
  }
40
 
41
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
+ #leaderboard-table td:nth-child(2),
43
+ #leaderboard-table th:nth-child(2) {
44
+ max-width: 400px;
45
+ overflow: auto;
46
+ white-space: nowrap;
47
  }
48
 
49
+ .tab-buttons button {
50
+ font-size: 20px;
 
51
  }
52
 
53
+ #scale-logo {
54
+ border-style: none !important;
55
+ box-shadow: none;
56
+ display: block;
57
+ margin-left: auto;
58
+ margin-right: auto;
59
+ max-width: 600px;
60
  }
61
 
62
+ #scale-logo .download {
63
+ display: none;
 
64
  }
65
+ #filter_type{
66
+ border: 0;
67
+ padding-left: 0;
68
+ padding-top: 0;
 
 
 
 
 
 
69
  }
70
+ #filter_type label {
71
+ display: flex;
 
 
 
72
  }
73
+ #filter_type label > span{
74
+ margin-top: var(--spacing-lg);
75
+ margin-right: 0.5em;
76
+ }
77
+ #filter_type label > .wrap{
78
+ width: 103px;
79
+ }
80
+ #filter_type label > .wrap .wrap-inner{
81
+ padding: 2px;
82
+ }
83
+ #filter_type label > .wrap .wrap-inner input{
84
+ width: 1px
85
+ }
86
+ #filter-columns-type{
87
+ border:0;
88
+ padding:0.5;
89
+ }
90
+ #filter-columns-size{
91
+ border:0;
92
+ padding:0.5;
93
+ }
94
+ #box-filter > .form{
95
+ border: 0
96
  }
97
  """
98
+
99
+ get_window_url_params = """
100
+ function(url_params) {
101
+ const params = new URLSearchParams(window.location.search);
102
+ url_params = Object.fromEntries(params);
103
+ return url_params;
104
+ }
105
+ """
src/display/formatting.py CHANGED
@@ -1,71 +1,27 @@
1
- """
2
- Formatting utilities for the GuardBench Leaderboard.
3
- """
4
 
5
- import pandas as pd
6
- import numpy as np
7
 
 
 
 
8
 
9
- def make_clickable_model(model_name: str) -> str:
10
- """
11
- Create a clickable link for a model name.
12
- """
13
- return f'<a href="https://huggingface.co/{model_name}" target="_blank">{model_name}</a>'
14
 
 
 
15
 
16
- def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
17
- """
18
- Check if a row has no NaN values in the specified columns.
19
- """
20
- return ~df[columns].isna().any(axis=1)
21
 
 
 
22
 
23
- def format_percentage(value: float) -> str:
24
- """
25
- Format a value as a percentage.
26
- """
27
- if pd.isna(value):
28
- return "N/A"
29
- return f"{value * 100:.2f}%"
30
 
 
 
31
 
32
- def format_number(value: float, precision: int = 2) -> str:
33
- """
34
- Format a number with specified precision.
35
- """
36
- if pd.isna(value):
37
- return "N/A"
38
- return f"{value:.{precision}f}"
39
 
 
 
40
 
41
- def styled_message(message: str) -> str:
42
- """
43
- Format a success message with styling.
44
- """
45
- return f"""
46
- <div style="padding: 10px; border-radius: 5px; background-color: #e6f7e6; color: #2e7d32; border: 1px solid #2e7d32;">
47
- ✅ {message}
48
- </div>
49
- """
50
 
51
-
52
- def styled_warning(message: str) -> str:
53
- """
54
- Format a warning message with styling.
55
- """
56
- return f"""
57
- <div style="padding: 10px; border-radius: 5px; background-color: #fff8e1; color: #ff8f00; border: 1px solid #ff8f00;">
58
- ⚠️ {message}
59
- </div>
60
- """
61
-
62
-
63
- def styled_error(message: str) -> str:
64
- """
65
- Format an error message with styling.
66
- """
67
- return f"""
68
- <div style="padding: 10px; border-radius: 5px; background-color: #ffebee; color: #c62828; border: 1px solid #c62828;">
69
- ❌ {message}
70
- </div>
71
- """
 
1
+ def model_hyperlink(link, model_name):
2
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
3
 
 
 
4
 
5
+ def make_clickable_model(model_name):
6
+ link = f"https://huggingface.co/{model_name}"
7
+ return model_hyperlink(link, model_name)
8
 
 
 
 
 
 
9
 
10
+ def styled_error(error):
11
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
12
 
 
 
 
 
 
13
 
14
+ def styled_warning(warn):
15
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
16
 
 
 
 
 
 
 
 
17
 
18
+ def styled_message(message):
19
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
20
 
 
 
 
 
 
 
 
21
 
22
+ def has_no_nan_values(df, columns):
23
+ return df[columns].notna().all(axis=1)
24
 
 
 
 
 
 
 
 
 
 
25
 
26
+ def has_nan_values(df, columns):
27
+ return df[columns].isna().any(axis=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display/utils.py CHANGED
@@ -1,417 +1,110 @@
1
- """
2
- Utility classes and functions for the CodeReview Bench Leaderboard display.
3
- """
4
 
5
- from dataclasses import dataclass, field, fields
6
- from enum import Enum, auto
7
- from typing import List, Optional
8
 
 
9
 
10
- class Mode(Enum):
11
- """Inference mode for the review model."""
12
- CoT = auto() # Chain of Thought
13
- Strict = auto()
14
-
15
- def __str__(self):
16
- """String representation of the mode."""
17
- return self.name
18
-
19
-
20
- class ModelType(Enum):
21
- """Model types for the leaderboard."""
22
- Unknown = auto()
23
- OpenSource = auto()
24
- ClosedSource = auto()
25
- API = auto()
26
-
27
- def to_str(self, separator: str = "-") -> str:
28
- """Convert enum to string with separator."""
29
- if self == ModelType.Unknown:
30
- return "Unknown"
31
- elif self == ModelType.OpenSource:
32
- return f"Open{separator}Source"
33
- elif self == ModelType.ClosedSource:
34
- return f"Closed{separator}Source"
35
- elif self == ModelType.API:
36
- return "API"
37
- return "Unknown"
38
-
39
-
40
- class ReviewModelType(str, Enum):
41
- """Review model types for the leaderboard."""
42
- GPT_4 = "gpt-4"
43
- GPT_3_5 = "gpt-3.5-turbo"
44
- CLAUDE = "claude"
45
- LLAMA = "llama"
46
- GEMINI = "gemini"
47
- CUSTOM = "custom"
48
-
49
- def __str__(self):
50
- """String representation of the review model type."""
51
- return self.value
52
-
53
-
54
- class Precision(Enum):
55
- """Model precision types."""
56
- Unknown = auto()
57
- float16 = auto()
58
- bfloat16 = auto()
59
- float32 = auto()
60
- int8 = auto()
61
- int4 = auto()
62
- NA = auto()
63
-
64
- def __str__(self):
65
- """String representation of the precision type."""
66
- return self.name
67
-
68
-
69
- class WeightType(Enum):
70
- """Model weight types."""
71
- Original = auto()
72
- Delta = auto()
73
- Adapter = auto()
74
-
75
- def __str__(self):
76
- """String representation of the weight type."""
77
- return self.name
78
 
79
 
 
 
 
80
  @dataclass
81
- class ColumnInfo:
82
- """Information about a column in the leaderboard."""
83
  name: str
84
- display_name: str
85
- type: str = "text"
86
  hidden: bool = False
87
  never_hidden: bool = False
88
- displayed_by_default: bool = True
89
-
90
 
 
 
 
91
  @dataclass
92
- class CodeReviewBenchColumn:
93
- """Columns for the CodeReview Bench leaderboard."""
94
- # Core metadata
95
- model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
96
- name="model_name",
97
- display_name="Model",
98
- never_hidden=True,
99
- displayed_by_default=True
100
- ))
101
- mode: ColumnInfo = field(default_factory=lambda: ColumnInfo(
102
- name="mode",
103
- display_name="Mode",
104
- displayed_by_default=True
105
- ))
106
- model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
107
- name="model_type",
108
- display_name="Access_Type",
109
- displayed_by_default=True
110
- ))
111
- submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
112
- name="submission_date",
113
- display_name="Submission_Date",
114
- displayed_by_default=False
115
- ))
116
- version: ColumnInfo = field(default_factory=lambda: ColumnInfo(
117
- name="version",
118
- display_name="Version",
119
- displayed_by_default=False
120
- ))
121
- review_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
122
- name="review_model_type",
123
- display_name="Type",
124
- displayed_by_default=False
125
- ))
126
- base_model: ColumnInfo = field(default_factory=lambda: ColumnInfo(
127
- name="base_model",
128
- display_name="Base Model",
129
- displayed_by_default=False
130
- ))
131
- revision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
132
- name="revision",
133
- display_name="Revision",
134
- displayed_by_default=False
135
- ))
136
- precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
137
- name="precision",
138
- display_name="Precision",
139
- displayed_by_default=False
140
- ))
141
- weight_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
142
- name="weight_type",
143
- display_name="Weight Type",
144
- displayed_by_default=False
145
- ))
146
- topic: ColumnInfo = field(default_factory=lambda: ColumnInfo(
147
- name="topic",
148
- display_name="Topic",
149
- displayed_by_default=True
150
- ))
151
-
152
- # LLM-based multimetric scores
153
- readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
154
- name="readability",
155
- display_name="Readability",
156
- type="number",
157
- displayed_by_default=True
158
- ))
159
- relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
160
- name="relevance",
161
- display_name="Relevance",
162
- type="number",
163
- displayed_by_default=True
164
- ))
165
- explanation_clarity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
166
- name="explanation_clarity",
167
- display_name="Explanation_Clarity",
168
- type="number",
169
- displayed_by_default=True
170
- ))
171
- problem_identification: ColumnInfo = field(default_factory=lambda: ColumnInfo(
172
- name="problem_identification",
173
- display_name="Problem_Identification",
174
- type="number",
175
- displayed_by_default=True
176
- ))
177
- actionability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
178
- name="actionability",
179
- display_name="Actionability",
180
- type="number",
181
- displayed_by_default=True
182
- ))
183
- completeness: ColumnInfo = field(default_factory=lambda: ColumnInfo(
184
- name="completeness",
185
- display_name="Completeness",
186
- type="number",
187
- displayed_by_default=True
188
- ))
189
- specificity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
190
- name="specificity",
191
- display_name="Specificity",
192
- type="number",
193
- displayed_by_default=True
194
- ))
195
- contextual_adequacy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
196
- name="contextual_adequacy",
197
- display_name="Contextual_Adequacy",
198
- type="number",
199
- displayed_by_default=True
200
- ))
201
- consistency: ColumnInfo = field(default_factory=lambda: ColumnInfo(
202
- name="consistency",
203
- display_name="Consistency",
204
- type="number",
205
- displayed_by_default=True
206
- ))
207
- brevity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
208
- name="brevity",
209
- display_name="Brevity",
210
- type="number",
211
- displayed_by_default=True
212
- ))
213
-
214
- # LLM-based-exact-match metrics
215
- pass_at_1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
216
- name="pass_at_1",
217
- display_name="Pass@1",
218
- type="number",
219
- displayed_by_default=True
220
- ))
221
- pass_at_5: ColumnInfo = field(default_factory=lambda: ColumnInfo(
222
- name="pass_at_5",
223
- display_name="Pass@5",
224
- type="number",
225
- displayed_by_default=True
226
- ))
227
- pass_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
228
- name="pass_at_10",
229
- display_name="Pass@10",
230
- type="number",
231
- displayed_by_default=True
232
- ))
233
- bleu_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
234
- name="bleu_at_10",
235
- display_name="BLEU@10",
236
- type="number",
237
- displayed_by_default=True
238
- ))
239
-
240
- # Overall aggregated metrics
241
- overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
242
- name="overall_score",
243
- display_name="Overall_Score",
244
- type="number",
245
- displayed_by_default=True
246
- ))
247
- multimetric_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
248
- name="multimetric_average",
249
- display_name="Multimetric_Average",
250
- type="number",
251
- displayed_by_default=True
252
- ))
253
- exact_match_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
254
- name="exact_match_average",
255
- display_name="Exact_Match_Average",
256
- type="number",
257
- displayed_by_default=True
258
- ))
259
- total_evaluations: ColumnInfo = field(default_factory=lambda: ColumnInfo(
260
- name="total_evaluations",
261
- display_name="Total_Evaluations",
262
- type="number",
263
- displayed_by_default=True
264
- ))
265
-
266
- # Language-specific metrics (Russian)
267
- ru_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
268
- name="ru_readability",
269
- display_name="RU_Readability",
270
- type="number",
271
- displayed_by_default=False
272
- ))
273
- ru_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
274
- name="ru_relevance",
275
- display_name="RU_Relevance",
276
- type="number",
277
- displayed_by_default=False
278
- ))
279
- ru_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
280
- name="ru_overall_score",
281
- display_name="RU_Overall_Score",
282
- type="number",
283
- displayed_by_default=False
284
- ))
285
-
286
- # Language-specific metrics (English)
287
- en_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
288
- name="en_readability",
289
- display_name="EN_Readability",
290
- type="number",
291
- displayed_by_default=False
292
- ))
293
- en_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
294
- name="en_relevance",
295
- display_name="EN_Relevance",
296
- type="number",
297
- displayed_by_default=False
298
- ))
299
- en_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
300
- name="en_overall_score",
301
- display_name="EN_Overall_Score",
302
- type="number",
303
- displayed_by_default=False
304
- ))
305
-
306
-
307
- # Create instances for easy access
308
- CODEREVIEW_COLUMN = CodeReviewBenchColumn()
309
-
310
- # Extract column lists for different views
311
- COLS = [f.name for f in fields(CODEREVIEW_COLUMN)]
312
- DISPLAY_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
313
- if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
314
-
315
- # Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
316
- def reorder_display_cols():
317
- cols = DISPLAY_COLS
318
- if 'model_name' in cols and 'mode' in cols:
319
- cols.remove('mode')
320
- model_name_index = cols.index('model_name')
321
- cols.insert(model_name_index + 1, 'mode')
322
- return cols
323
- DISPLAY_COLS = reorder_display_cols()
324
-
325
- METRIC_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
326
- if getattr(CODEREVIEW_COLUMN, f.name).type == "number"]
327
- HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
328
- if getattr(CODEREVIEW_COLUMN, f.name).hidden]
329
- NEVER_HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
330
- if getattr(CODEREVIEW_COLUMN, f.name).never_hidden]
331
-
332
- # Categories for CodeReview Bench (Programming Languages)
333
- CATEGORIES = [
334
- 'Python',
335
- 'Java',
336
- 'Scala',
337
- 'Go'
338
- ]
339
-
340
- # Language taxonomies for CodeReview Bench
341
- COMMENT_LANGUAGES = [
342
- 'ru', # Russian
343
- 'en' # English
344
- ]
345
-
346
- # Topics for CodeReview Bench
347
- TOPICS = [
348
- 'Code Reliability',
349
- 'Coding Standards',
350
- 'Code Organization',
351
- 'Performance Issues',
352
- 'Validation',
353
- 'Variables'
354
- ]
355
-
356
- # Example categories
357
- EXAMPLE_CATEGORIES = [
358
- 'Bug_Fix',
359
- 'Code_Style',
360
- 'Performance',
361
- 'Security',
362
- 'Refactoring',
363
- 'Documentation',
364
- 'Testing',
365
- 'Architecture',
366
- 'Other'
367
- ]
368
 
369
- # Metrics for CodeReview Bench
370
- MULTIMETRIC_METRICS = [
371
- "readability",
372
- "relevance",
373
- "explanation_clarity",
374
- "problem_identification",
375
- "actionability",
376
- "completeness",
377
- "specificity",
378
- "contextual_adequacy",
379
- "consistency",
380
- "brevity"
381
- ]
382
 
383
- EXACT_MATCH_METRICS = [
384
- "pass_at_1",
385
- "pass_at_5",
386
- "pass_at_10",
387
- "bleu_at_10"
388
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
 
390
- def get_all_column_choices():
391
- """
392
- Get all available column choices for the multiselect dropdown.
 
393
 
394
- Returns:
395
- List of tuples with (column_name, display_name) for all columns.
396
- """
397
- column_choices = []
398
 
399
- default_visible_columns = get_default_visible_columns()
 
 
 
 
 
400
 
401
- for f in fields(CODEREVIEW_COLUMN):
402
- column_info = getattr(CODEREVIEW_COLUMN, f.name)
403
- # Create a tuple with both the internal name and display name
404
- if column_info.name not in default_visible_columns:
405
- column_choices.append((column_info.name, column_info.display_name))
406
 
407
- return column_choices
 
408
 
409
- def get_default_visible_columns():
410
- """
411
- Get the list of column names that should be visible by default.
412
 
413
- Returns:
414
- List of column names that are displayed by default.
415
- """
416
- return [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
417
- if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
 
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
 
3
 
4
+ import pandas as pd
 
 
5
 
6
+ from src.about import Tasks
7
 
8
+ def fields(raw_class):
9
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
 
12
+ # These classes are for user facing column names,
13
+ # to avoid having to change them all around the code
14
+ # when a modif is needed
15
  @dataclass
16
+ class ColumnContent:
 
17
  name: str
18
+ type: str
19
+ displayed_by_default: bool
20
  hidden: bool = False
21
  never_hidden: bool = False
 
 
22
 
23
+ ## Leaderboard columns
24
+ auto_eval_column_dict = []
25
+ # Init
26
+ auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
27
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
28
+ #Scores
29
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
30
+ for task in Tasks:
31
+ auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
32
+ # Model information
33
+ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
34
+ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
35
+ auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
36
+ auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
37
+ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
38
+ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
39
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
40
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
41
+ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
42
+
43
+ # We use make dataclass to dynamically fill the scores from Tasks
44
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
45
+
46
+ ## For the queue columns in the submission tab
47
+ @dataclass(frozen=True)
48
+ class EvalQueueColumn: # Queue column
49
+ model = ColumnContent("model", "markdown", True)
50
+ revision = ColumnContent("revision", "str", True)
51
+ private = ColumnContent("private", "bool", True)
52
+ precision = ColumnContent("precision", "str", True)
53
+ weight_type = ColumnContent("weight_type", "str", "Original")
54
+ status = ColumnContent("status", "str", True)
55
+
56
+ ## All the model information that we might need
57
  @dataclass
58
+ class ModelDetails:
59
+ name: str
60
+ display_name: str = ""
61
+ symbol: str = "" # emoji
 
 
 
 
 
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
+ class ModelType(Enum):
65
+ PT = ModelDetails(name="pretrained", symbol="🟢")
66
+ FT = ModelDetails(name="fine-tuned", symbol="🔶")
67
+ IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
68
+ RL = ModelDetails(name="RL-tuned", symbol="🟦")
69
+ Unknown = ModelDetails(name="", symbol="?")
70
+
71
+ def to_str(self, separator=" "):
72
+ return f"{self.value.symbol}{separator}{self.value.name}"
73
+
74
+ @staticmethod
75
+ def from_str(type):
76
+ if "fine-tuned" in type or "🔶" in type:
77
+ return ModelType.FT
78
+ if "pretrained" in type or "🟢" in type:
79
+ return ModelType.PT
80
+ if "RL-tuned" in type or "🟦" in type:
81
+ return ModelType.RL
82
+ if "instruction-tuned" in type or "⭕" in type:
83
+ return ModelType.IFT
84
+ return ModelType.Unknown
85
 
86
+ class WeightType(Enum):
87
+ Adapter = ModelDetails("Adapter")
88
+ Original = ModelDetails("Original")
89
+ Delta = ModelDetails("Delta")
90
 
91
+ class Precision(Enum):
92
+ float16 = ModelDetails("float16")
93
+ bfloat16 = ModelDetails("bfloat16")
94
+ Unknown = ModelDetails("?")
95
 
96
+ def from_str(precision):
97
+ if precision in ["torch.float16", "float16"]:
98
+ return Precision.float16
99
+ if precision in ["torch.bfloat16", "bfloat16"]:
100
+ return Precision.bfloat16
101
+ return Precision.Unknown
102
 
103
+ # Column selection
104
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
 
 
105
 
106
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
107
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
108
 
109
+ BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
 
110
 
 
 
 
 
 
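For reference, the dynamically generated column classes above are meant to be consumed through the module's own `fields()` helper rather than `dataclasses.fields`. A quick sketch of what the derived constants resolve to (run from the Space root so `src` is importable; outputs follow the definitions in this file):

```python
from src.display.utils import AutoEvalColumn, COLS, EVAL_COLS, fields

print(AutoEvalColumn.model.name)                                   # "Model"
print([c.name for c in fields(AutoEvalColumn) if c.never_hidden])  # ["T", "Model"]
print(EVAL_COLS)  # ["model", "revision", "private", "precision", "weight_type", "status"]
print(len(COLS))  # number of non-hidden leaderboard columns
```
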
src/envs.py CHANGED
@@ -1,27 +1,25 @@
1
  import os
 
2
  from huggingface_hub import HfApi
3
- from dotenv import load_dotenv
4
 
5
- # Load environment variables
6
- load_dotenv()
 
7
 
8
- # Hugging Face configuration
9
- TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
10
- OWNER = os.environ.get("OWNER", "codereview-bench") # Change to your org
11
- SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
12
- ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME")
13
- ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
14
 
15
- # Repository IDs
16
- REPO_ID = f"{OWNER}/codereview-bench"
17
- RESULTS_DATASET_ID = os.environ.get("RESULTS_DATASET_ID", f"{OWNER}/codereview-bench-results")
18
 
19
- # Cache paths
20
- CACHE_PATH = os.getenv("HF_HOME", ".")
21
- DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
22
 
23
- # Local data paths
24
- LEADERBOARD_FILE = os.path.join(DATA_PATH, "leaderboard.json")
 
 
 
25
 
26
- # HF API instance
27
  API = HfApi(token=TOKEN)
 
1
  import os
2
+
3
  from huggingface_hub import HfApi
 
4
 
5
+ # Info to change for your repository
6
+ # ----------------------------------
7
+ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
+ OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
+ # ----------------------------------
 
 
 
 
11
 
12
+ REPO_ID = f"{OWNER}/leaderboard"
13
+ QUEUE_REPO = f"{OWNER}/requests"
14
+ RESULTS_REPO = f"{OWNER}/results"
15
 
16
+ # If you setup a cache later, just change HF_HOME
17
+ CACHE_PATH=os.getenv("HF_HOME", ".")
 
18
 
19
+ # Local caches
20
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
 
 
25
  API = HfApi(token=TOKEN)
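
With the trimmed-down `envs.py`, the only secret the Space needs is `HF_TOKEN`; repositories and cache locations are derived from the hard-coded `OWNER`. A small sketch of how the local cache paths resolve when `HF_HOME` is unset (it simply mirrors the logic above, nothing new):

```python
import os

cache_path = os.getenv("HF_HOME", ".")           # CACHE_PATH default
print(os.path.join(cache_path, "eval-queue"))    # ./eval-queue
print(os.path.join(cache_path, "eval-results"))  # ./eval-results
```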
src/leaderboard/__init__.py DELETED
@@ -1 +0,0 @@
1
- # Leaderboard processing module
 
 
src/leaderboard/processor.py DELETED
@@ -1,271 +0,0 @@
1
- """
2
- Process CodeReview Bench leaderboard data and submissions.
3
- """
4
-
5
- import json
6
- import os
7
- import pandas as pd
8
- from datetime import datetime
9
- from typing import Dict, List, Tuple, Optional
10
- import numpy as np
11
-
12
- from src.display.utils import (
13
- CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES, COMMENT_LANGUAGES, EXAMPLE_CATEGORIES,
14
- MULTIMETRIC_METRICS, EXACT_MATCH_METRICS
15
- )
16
-
17
-
18
- def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
19
- """
20
- Process a JSONL submission file for CodeReview Bench.
21
-
22
- Args:
23
- file_path: Path to the JSONL submission file
24
-
25
- Returns:
26
- Tuple of (entries_list, message)
27
- """
28
- try:
29
- entries = []
30
- with open(file_path, 'r', encoding='utf-8') as f:
31
- for line_num, line in enumerate(f, 1):
32
- line = line.strip()
33
- if not line:
34
- continue
35
-
36
- try:
37
- entry = json.loads(line)
38
-
39
- # Validate required fields
40
- required_fields = ['model_name', 'programming_language', 'comment_language']
41
- missing_fields = [field for field in required_fields if field not in entry]
42
- if missing_fields:
43
- return [], f"Missing required fields {missing_fields} in line {line_num}"
44
-
45
- # Validate metrics exist
46
- has_multimetric = any(metric in entry for metric in MULTIMETRIC_METRICS)
47
- has_exact_match = any(metric in entry for metric in EXACT_MATCH_METRICS)
48
-
49
- if not has_multimetric and not has_exact_match:
50
- return [], f"No valid metrics found in line {line_num}. Required: {MULTIMETRIC_METRICS + EXACT_MATCH_METRICS}"
51
-
52
- entries.append(entry)
53
-
54
- except json.JSONDecodeError as e:
55
- return [], f"Invalid JSON in line {line_num}: {e}"
56
-
57
- if not entries:
58
- return [], "No valid entries found in submission file"
59
-
60
- return entries, f"Successfully processed {len(entries)} entries"
61
-
62
- except Exception as e:
63
- return [], f"Error processing submission: {e}"
64
-
65
-
66
- def calculate_overall_score(entry: Dict) -> float:
67
- """
68
- Calculate overall score for a CodeReview Bench entry.
69
-
70
- Args:
71
- entry: Dictionary containing model evaluation results
72
-
73
- Returns:
74
- Overall score as float
75
- """
76
- # Calculate multimetric average
77
- multimetric_scores = []
78
- for metric in MULTIMETRIC_METRICS:
79
- if metric in entry and isinstance(entry[metric], (int, float)):
80
- multimetric_scores.append(entry[metric])
81
-
82
- multimetric_avg = np.mean(multimetric_scores) if multimetric_scores else 0
83
-
84
- # Calculate exact match average
85
- exact_match_scores = []
86
- for metric in EXACT_MATCH_METRICS:
87
- if metric in entry and isinstance(entry[metric], (int, float)):
88
- exact_match_scores.append(entry[metric])
89
-
90
- exact_match_avg = np.mean(exact_match_scores) if exact_match_scores else 0
91
-
92
- # Weighted combination (can be adjusted based on requirements)
93
- overall_score = (multimetric_avg * 0.7) + (exact_match_avg * 0.3)
94
-
95
- return overall_score
96
-
97
-
98
- def load_leaderboard_data(file_path: str) -> Dict:
99
- """
100
- Load the leaderboard data from a JSON file.
101
- """
102
- if not os.path.exists(file_path):
103
- version = "v0"
104
- if "_v" in file_path:
105
- version = file_path.split("_")[-1].split(".")[0]
106
- return {"entries": [], "last_updated": datetime.now().isoformat(), "version": version}
107
-
108
- with open(file_path, 'r') as f:
109
- data = json.load(f)
110
-
111
- # Ensure version field exists
112
- if "version" not in data:
113
- version = "v0"
114
- if "_v" in file_path:
115
- version = file_path.split("_")[-1].split(".")[0]
116
- data["version"] = version
117
-
118
- return data
119
-
120
-
121
- def save_leaderboard_data(data: Dict, file_path: str) -> None:
122
- """
123
- Save the leaderboard data to a JSON file.
124
- """
125
- # Ensure the directory exists
126
- os.makedirs(os.path.dirname(file_path), exist_ok=True)
127
-
128
- # Update the last_updated timestamp
129
- data["last_updated"] = datetime.now().isoformat()
130
-
131
- # Ensure version is set
132
- if "version" not in data:
133
- version = "v0"
134
- if "_v" in file_path:
135
- version = file_path.split("_")[-1].split(".")[0]
136
- data["version"] = version
137
-
138
- with open(file_path, 'w') as f:
139
- json.dump(data, f, indent=2)
140
-
141
-
142
- def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
143
- """
144
- Convert leaderboard data to a pandas DataFrame for display.
145
- """
146
- rows = []
147
-
148
- for entry in leaderboard_data.get("entries", []):
149
- model_name = entry.get("model_name", "Unknown Model")
150
-
151
- # Extract basic metadata
152
- row = {
153
- "model_name": model_name,
154
- "model_type": entry.get("model_type", "Unknown"),
155
- "mode": entry.get("mode", "Strict"),
156
- "submission_date": entry.get("submission_date", ""),
157
- "version": entry.get("version", "v0"),
158
- "review_model_type": entry.get("review_model_type", "custom").lower()
159
- }
160
-
161
- # Add additional metadata fields if present
162
- for key in ["base_model", "revision", "precision", "weight_type", "topic", "programming_language", "comment_language"]:
163
- if key in entry:
164
- row[key] = entry[key]
165
-
166
- # Add multimetric scores
167
- for metric in MULTIMETRIC_METRICS:
168
- if metric in entry:
169
- row[metric] = entry[metric]
170
- else:
171
- row[metric] = pd.NA
172
-
173
- # Add exact match metrics
174
- for metric in EXACT_MATCH_METRICS:
175
- if metric in entry:
176
- row[metric] = entry[metric]
177
- else:
178
- row[metric] = pd.NA
179
-
180
- # Calculate aggregated metrics
181
- multimetric_scores = [entry.get(metric, 0) for metric in MULTIMETRIC_METRICS if metric in entry and pd.notna(entry[metric])]
182
- exact_match_scores = [entry.get(metric, 0) for metric in EXACT_MATCH_METRICS if metric in entry and pd.notna(entry[metric])]
183
-
184
- if multimetric_scores:
185
- row["multimetric_average"] = np.mean(multimetric_scores)
186
- else:
187
- row["multimetric_average"] = pd.NA
188
-
189
- if exact_match_scores:
190
- row["exact_match_average"] = np.mean(exact_match_scores)
191
- else:
192
- row["exact_match_average"] = pd.NA
193
-
194
- # Calculate overall score
195
- row["overall_score"] = calculate_overall_score(entry)
196
-
197
- # Add language-specific metrics if available
198
- for lang in COMMENT_LANGUAGES:
199
- for metric in ["readability", "relevance", "overall_score"]:
200
- lang_key = f"{lang}_{metric}"
201
- if lang_key in entry:
202
- row[lang_key] = entry[lang_key]
203
- else:
204
- row[lang_key] = pd.NA
205
-
206
- # Add evaluation count
207
- row["total_evaluations"] = entry.get("total_evaluations", entry.get("evaluation_count", pd.NA))
208
-
209
- rows.append(row)
210
-
211
- # Create DataFrame and sort by overall score
212
- df = pd.DataFrame(rows)
213
-
214
- # Ensure all expected columns exist
215
- for metric in MULTIMETRIC_METRICS + EXACT_MATCH_METRICS:
216
- if metric not in df.columns:
217
- df[metric] = pd.NA
218
-
219
- # Sort by overall score (descending)
220
- if not df.empty:
221
- df = df.sort_values(by="overall_score", ascending=False, na_position='last')
222
-
223
- # Ensure summary columns exist
224
- summary_cols = ["overall_score", "multimetric_average", "exact_match_average", "total_evaluations"]
225
- for col in summary_cols:
226
- if col not in df.columns:
227
- df[col] = pd.NA
228
-
229
- return df
230
-
231
-
232
- def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict]) -> Dict:
233
- """
234
- Add new entries to the leaderboard, replacing any with the same model name.
235
- """
236
- # Create a mapping of existing entries by model name and version
237
- existing_entries = {
238
- (entry["model_name"], entry.get("version", "v0")): i
239
- for i, entry in enumerate(leaderboard_data.get("entries", []))
240
- }
241
-
242
- # Process each new entry
243
- for new_entry in new_entries:
244
- model_name = new_entry.get("model_name")
245
- version = new_entry.get("version", "v0")
246
-
247
- # Add calculated metrics
248
- new_entry["overall_score"] = calculate_overall_score(new_entry)
249
-
250
- # Calculate averages
251
- multimetric_scores = [new_entry.get(metric) for metric in MULTIMETRIC_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
252
- exact_match_scores = [new_entry.get(metric) for metric in EXACT_MATCH_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
253
-
254
- if multimetric_scores:
255
- new_entry["multimetric_average"] = np.mean(multimetric_scores)
256
- if exact_match_scores:
257
- new_entry["exact_match_average"] = np.mean(exact_match_scores)
258
-
259
- if (model_name, version) in existing_entries:
260
- # Replace existing entry
261
- leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
262
- else:
263
- # Add new entry
264
- if "entries" not in leaderboard_data:
265
- leaderboard_data["entries"] = []
266
- leaderboard_data["entries"].append(new_entry)
267
-
268
- # Update the last_updated timestamp
269
- leaderboard_data["last_updated"] = datetime.now().isoformat()
270
-
271
- return leaderboard_data
 
src/leaderboard/read_evals.py ADDED
@@ -0,0 +1,196 @@
1
+ import glob
2
+ import json
3
+ import math
4
+ import os
5
+ from dataclasses import dataclass
6
+
7
+ import dateutil
8
+ import numpy as np
9
+
10
+ from src.display.formatting import make_clickable_model
11
+ from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
+ from src.submission.check_validity import is_model_on_hub
13
+
14
+
15
+ @dataclass
16
+ class EvalResult:
17
+ """Represents one full evaluation. Built from a combination of the result and request file for a given run.
18
+ """
19
+ eval_name: str # org_model_precision (uid)
20
+ full_model: str # org/model (path on hub)
21
+ org: str
22
+ model: str
23
+ revision: str # commit hash, "" if main
24
+ results: dict
25
+ precision: Precision = Precision.Unknown
26
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
+ weight_type: WeightType = WeightType.Original # Original or Adapter
28
+ architecture: str = "Unknown"
29
+ license: str = "?"
30
+ likes: int = 0
31
+ num_params: int = 0
32
+ date: str = "" # submission date of request file
33
+ still_on_hub: bool = False
34
+
35
+ @classmethod
36
+ def init_from_json_file(self, json_filepath):
37
+ """Inits the result from the specific model result file"""
38
+ with open(json_filepath) as fp:
39
+ data = json.load(fp)
40
+
41
+ config = data.get("config")
42
+
43
+ # Precision
44
+ precision = Precision.from_str(config.get("model_dtype"))
45
+
46
+ # Get model and org
47
+ org_and_model = config.get("model_name", config.get("model_args", None))
48
+ org_and_model = org_and_model.split("/", 1)
49
+
50
+ if len(org_and_model) == 1:
51
+ org = None
52
+ model = org_and_model[0]
53
+ result_key = f"{model}_{precision.value.name}"
54
+ else:
55
+ org = org_and_model[0]
56
+ model = org_and_model[1]
57
+ result_key = f"{org}_{model}_{precision.value.name}"
58
+ full_model = "/".join(org_and_model)
59
+
60
+ still_on_hub, _, model_config = is_model_on_hub(
61
+ full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
62
+ )
63
+ architecture = "?"
64
+ if model_config is not None:
65
+ architectures = getattr(model_config, "architectures", None)
66
+ if architectures:
67
+ architecture = ";".join(architectures)
68
+
69
+ # Extract results available in this file (some results are split in several files)
70
+ results = {}
71
+ for task in Tasks:
72
+ task = task.value
73
+
74
+ # We average all scores of a given metric (not all metrics are present in all files)
75
+ accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
+ if accs.size == 0 or any([acc is None for acc in accs]):
77
+ continue
78
+
79
+ mean_acc = np.mean(accs) * 100.0
80
+ results[task.benchmark] = mean_acc
81
+
82
+ return self(
83
+ eval_name=result_key,
84
+ full_model=full_model,
85
+ org=org,
86
+ model=model,
87
+ results=results,
88
+ precision=precision,
89
+ revision= config.get("model_sha", ""),
90
+ still_on_hub=still_on_hub,
91
+ architecture=architecture
92
+ )
93
+
94
+ def update_with_request_file(self, requests_path):
95
+ """Finds the relevant request file for the current model and updates info with it"""
96
+ request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
97
+
98
+ try:
99
+ with open(request_file, "r") as f:
100
+ request = json.load(f)
101
+ self.model_type = ModelType.from_str(request.get("model_type", ""))
102
+ self.weight_type = WeightType[request.get("weight_type", "Original")]
103
+ self.license = request.get("license", "?")
104
+ self.likes = request.get("likes", 0)
105
+ self.num_params = request.get("params", 0)
106
+ self.date = request.get("submitted_time", "")
107
+ except Exception:
108
+ print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
109
+
110
+ def to_dict(self):
111
+ """Converts the Eval Result to a dict compatible with our dataframe display"""
112
+ average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
113
+ data_dict = {
114
+ "eval_name": self.eval_name, # not a column, just a save name,
115
+ AutoEvalColumn.precision.name: self.precision.value.name,
116
+ AutoEvalColumn.model_type.name: self.model_type.value.name,
117
+ AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
118
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
119
+ AutoEvalColumn.architecture.name: self.architecture,
120
+ AutoEvalColumn.model.name: make_clickable_model(self.full_model),
121
+ AutoEvalColumn.revision.name: self.revision,
122
+ AutoEvalColumn.average.name: average,
123
+ AutoEvalColumn.license.name: self.license,
124
+ AutoEvalColumn.likes.name: self.likes,
125
+ AutoEvalColumn.params.name: self.num_params,
126
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
127
+ }
128
+
129
+ for task in Tasks:
130
+ data_dict[task.value.col_name] = self.results[task.value.benchmark]
131
+
132
+ return data_dict
133
+
134
+
135
+ def get_request_file_for_model(requests_path, model_name, precision):
136
+ """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
137
+ request_files = os.path.join(
138
+ requests_path,
139
+ f"{model_name}_eval_request_*.json",
140
+ )
141
+ request_files = glob.glob(request_files)
142
+
143
+ # Select correct request file (precision)
144
+ request_file = ""
145
+ request_files = sorted(request_files, reverse=True)
146
+ for tmp_request_file in request_files:
147
+ with open(tmp_request_file, "r") as f:
148
+ req_content = json.load(f)
149
+ if (
150
+ req_content["status"] in ["FINISHED"]
151
+ and req_content["precision"] == precision.split(".")[-1]
152
+ ):
153
+ request_file = tmp_request_file
154
+ return request_file
155
+
156
+
157
+ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
158
+ """From the path of the results folder root, extract all needed info for results"""
159
+ model_result_filepaths = []
160
+
161
+ for root, _, files in os.walk(results_path):
162
+ # We should only have json files in model results
163
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
164
+ continue
165
+
166
+ # Sort the files by date
167
+ try:
168
+ files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
169
+ except dateutil.parser._parser.ParserError:
170
+ files = [files[-1]]
171
+
172
+ for file in files:
173
+ model_result_filepaths.append(os.path.join(root, file))
174
+
175
+ eval_results = {}
176
+ for model_result_filepath in model_result_filepaths:
177
+ # Creation of result
178
+ eval_result = EvalResult.init_from_json_file(model_result_filepath)
179
+ eval_result.update_with_request_file(requests_path)
180
+
181
+ # Store results of same eval together
182
+ eval_name = eval_result.eval_name
183
+ if eval_name in eval_results.keys():
184
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
185
+ else:
186
+ eval_results[eval_name] = eval_result
187
+
188
+ results = []
189
+ for v in eval_results.values():
190
+ try:
191
+ v.to_dict() # we test if the dict version is complete
192
+ results.append(v)
193
+ except KeyError: # not all eval values present
194
+ continue
195
+
196
+ return results
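
Taken together, EvalResult.init_from_json_file, update_with_request_file and get_raw_eval_results turn a folder of per-model result JSON files plus their matching request files into display-ready rows. A minimal sketch of driving that pipeline, assuming local eval-results/ and eval-queue/ checkouts (both folder names are assumptions, not taken from this diff):

    # Illustrative sketch only: exercises the helpers added in src/leaderboard/read_evals.py.
    # The two paths are assumptions; in the Space they would normally come from src.envs.
    from src.leaderboard.read_evals import get_raw_eval_results

    results_path = "eval-results"   # assumed local clone of the results dataset
    requests_path = "eval-queue"    # assumed local clone of the requests dataset

    for res in get_raw_eval_results(results_path, requests_path):
        row = res.to_dict()  # dict keyed by the AutoEvalColumn display names
        print(res.eval_name, res.precision.value.name, len(row))
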
src/populate.py CHANGED
@@ -1,188 +1,58 @@
1
- """
2
- Populate the CodeReview Bench leaderboard from HuggingFace datasets.
3
- """
4
-
5
  import json
6
  import os
7
- import pandas as pd
8
- import tempfile
9
- from typing import Dict, List, Optional
10
- from datetime import datetime
11
- import numpy as np
12
-
13
- from huggingface_hub import hf_hub_download, HfApi
14
- from datasets import load_dataset
15
-
16
- from src.display.utils import CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES
17
- from src.envs import RESULTS_DATASET_ID, TOKEN, CACHE_PATH
18
- from src.leaderboard.processor import leaderboard_to_dataframe
19
-
20
-
21
- def get_latest_leaderboard(version="v0") -> Optional[Dict]:
22
- """
23
- Get the latest leaderboard data from HuggingFace dataset.
24
- Fallback to local JSON file if HF download fails or is unavailable.
25
- """
26
- # First try to fetch from HuggingFace Hub
27
- try:
28
- leaderboard_path = hf_hub_download(
29
- repo_id=RESULTS_DATASET_ID,
30
- filename=f"leaderboards/leaderboard_{version}.json",
31
- repo_type="dataset",
32
- token=TOKEN
33
- )
34
- with open(leaderboard_path, 'r') as f:
35
- return json.load(f)
36
- except Exception as hf_err:
37
- print(f"HF download failed or unavailable: {hf_err}. Trying local fallback...")
38
-
39
- # Fallback: attempt to load a local leaderboard_data.json located at the project root
40
- project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
41
- local_path_candidates = [
42
- os.path.join(project_root, "leaderboard_data.json"), # legacy path in root
43
- os.path.join(project_root, "data", "leaderboard.json"), # path defined in envs.py
44
- ]
45
-
46
- for local_path in local_path_candidates:
47
- if os.path.exists(local_path):
48
- try:
49
- with open(local_path, 'r') as f:
50
- return json.load(f)
51
- except Exception as local_err:
52
- print(f"Error loading local leaderboard file {local_path}: {local_err}")
53
-
54
- # If nothing found, return None
55
- return None
56
-
57
-
58
- def get_model_entry(model_name: str, mode: str, version="v0") -> Optional[Dict]:
59
- """
60
- Get a specific model's entry from the entries folder, uniquely identified by model_name, mode, and version.
61
- """
62
- try:
63
- model_name_safe = model_name.replace("/", "_").replace(" ", "_")
64
- mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
65
- entry_path = hf_hub_download(
66
- repo_id=RESULTS_DATASET_ID,
67
- filename=f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json",
68
- repo_type="dataset",
69
- token=TOKEN
70
- )
71
- with open(entry_path, 'r') as f:
72
- return json.load(f)
73
- except Exception as e:
74
- print(f"Error downloading model entry: {e}")
75
- return None
76
-
77
-
78
- def get_all_entries(version="v0") -> List[Dict]:
79
- """
80
- Get all entries from the HuggingFace dataset.
81
- """
82
- try:
83
- api = HfApi(token=TOKEN)
84
- files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
85
- entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
86
-
87
- all_entries = []
88
- for entry_file in entry_files:
89
- try:
90
- entry_path = hf_hub_download(
91
- repo_id=RESULTS_DATASET_ID,
92
- filename=entry_file,
93
- repo_type="dataset",
94
- token=TOKEN
95
- )
96
- with open(entry_path, 'r') as f:
97
- entry_data = json.load(f)
98
- all_entries.append(entry_data)
99
- except Exception as e:
100
- print(f"Error loading entry {entry_file}: {e}")
101
-
102
- return all_entries
103
- except Exception as e:
104
- print(f"Error getting all entries: {e}")
105
- return []
106
-
107
-
108
- def get_leaderboard_df(version="v0") -> pd.DataFrame:
109
- """
110
- Get the leaderboard data as a DataFrame.
111
- """
112
- # Get latest leaderboard data
113
- leaderboard_data = get_latest_leaderboard(version)
114
-
115
- if not leaderboard_data:
116
- # If no leaderboard exists, try to build it from entries
117
- entries = get_all_entries(version)
118
- if entries:
119
- leaderboard_data = {
120
- "entries": entries,
121
- "last_updated": datetime.now().isoformat(),
122
- "version": version
123
- }
124
- else:
125
- # Return empty DataFrame if no data available
126
- return pd.DataFrame(columns=DISPLAY_COLS)
127
-
128
- # Convert to DataFrame
129
- return leaderboard_to_dataframe(leaderboard_data)
130
-
131
-
132
- def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
133
- """
134
- Get the leaderboard data filtered by a specific programming language category.
135
- """
136
- # Get latest leaderboard data
137
- leaderboard_data = get_latest_leaderboard(version)
138
-
139
- if not leaderboard_data:
140
- # If no leaderboard exists, try to build it from entries
141
- entries = get_all_entries(version)
142
- if entries:
143
- leaderboard_data = {
144
- "entries": entries,
145
- "last_updated": datetime.now().isoformat(),
146
- "version": version
147
- }
148
- else:
149
- # Return empty DataFrame if no data available
150
- return pd.DataFrame(columns=DISPLAY_COLS)
151
-
152
- # Filter entries to only include those with data for the specified programming language
153
- filtered_entries = []
154
- for entry in leaderboard_data.get("entries", []):
155
- # Check if entry has data for this programming language
156
- programming_language = entry.get("programming_language", "").lower()
157
- if programming_language == category.lower() or category.lower() == "other":
158
- # For "other" category, include entries that don't match any specific language
159
- if category.lower() == "other":
160
- if programming_language not in [cat.lower() for cat in CATEGORIES[:-1]]: # Exclude "Other" from check
161
- filtered_entries.append(entry)
162
- else:
163
- filtered_entries.append(entry)
164
-
165
- # Create a new leaderboard data structure with the filtered entries
166
- filtered_leaderboard = {
167
- "entries": filtered_entries,
168
- "last_updated": leaderboard_data.get("last_updated", datetime.now().isoformat()),
169
- "version": version
170
- }
171
-
172
- # Convert to DataFrame
173
- return leaderboard_to_dataframe(filtered_leaderboard)
174
 
 
175
 
176
- def get_detailed_model_data(model_name: str, mode: str, version="v0") -> Dict:
177
- """
178
- Get detailed data for a specific model and mode.
179
- """
180
- entry = get_model_entry(model_name, mode, version)
181
- if entry:
182
- return entry
183
- leaderboard_data = get_latest_leaderboard(version)
184
- if leaderboard_data:
185
- for entry in leaderboard_data.get("entries", []):
186
- if entry.get("model_name") == model_name and str(entry.get("mode")).lower() == str(mode).lower():
187
- return entry
188
- return {}
1
  import json
2
  import os
3
 
4
+ import pandas as pd
5
 
6
+ from src.display.formatting import has_no_nan_values, make_clickable_model
7
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
+ from src.leaderboard.read_evals import get_raw_eval_results
9
+
10
+
11
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
+ """Creates a dataframe from all the individual experiment results"""
13
+ raw_data = get_raw_eval_results(results_path, requests_path)
14
+ all_data_json = [v.to_dict() for v in raw_data]
15
+
16
+ df = pd.DataFrame.from_records(all_data_json)
17
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
+ df = df[cols].round(decimals=2)
19
+
20
+ # filter out if any of the benchmarks have not been produced
21
+ df = df[has_no_nan_values(df, benchmark_cols)]
22
+ return df
23
+
24
+
25
+ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
26
+ """Creates the different dataframes for the evaluation queue requests"""
27
+ entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
28
+ all_evals = []
29
+
30
+ for entry in entries:
31
+ if ".json" in entry:
32
+ file_path = os.path.join(save_path, entry)
33
+ with open(file_path) as fp:
34
+ data = json.load(fp)
35
+
36
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
37
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
38
+
39
+ all_evals.append(data)
40
+ elif ".md" not in entry:
41
+ # this is a folder
42
+ sub_entries = [e for e in os.listdir(os.path.join(save_path, entry)) if os.path.isfile(os.path.join(save_path, entry, e)) and not e.startswith(".")]
43
+ for sub_entry in sub_entries:
44
+ file_path = os.path.join(save_path, entry, sub_entry)
45
+ with open(file_path) as fp:
46
+ data = json.load(fp)
47
+
48
+ data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
49
+ data[EvalQueueColumn.revision.name] = data.get("revision", "main")
50
+ all_evals.append(data)
51
+
52
+ pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
53
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
54
+ finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
55
+ df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
56
+ df_running = pd.DataFrame.from_records(running_list, columns=cols)
57
+ df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
58
+ return df_finished[cols], df_running[cols], df_pending[cols]
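
The rewritten populate module now exposes just two entry points: get_leaderboard_df builds the ranked results table from the raw EvalResult objects, and get_evaluation_queue_df splits the request files into finished, running and pending views. A short usage sketch, assuming COLS, BENCHMARK_COLS and EVAL_COLS column lists are defined in src/display/utils.py (they do not appear in this diff):

    # Illustrative sketch only; the paths and column lists below are assumptions.
    from src.display.utils import COLS, BENCHMARK_COLS, EVAL_COLS  # assumed to be defined there
    from src.populate import get_evaluation_queue_df, get_leaderboard_df

    leaderboard_df = get_leaderboard_df("eval-results", "eval-queue", COLS, BENCHMARK_COLS)
    finished_df, running_df, pending_df = get_evaluation_queue_df("eval-queue", EVAL_COLS)
    print(len(leaderboard_df), len(finished_df), len(running_df), len(pending_df))
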
src/submission/__init__.py DELETED
@@ -1 +0,0 @@
1
- # Submission handling module
 
 
src/submission/check_validity.py ADDED
@@ -0,0 +1,99 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from collections import defaultdict
5
+ from datetime import datetime, timedelta, timezone
6
+
7
+ import huggingface_hub
8
+ from huggingface_hub import ModelCard
9
+ from huggingface_hub.hf_api import ModelInfo
10
+ from transformers import AutoConfig
11
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
12
+
13
+ def check_model_card(repo_id: str) -> tuple[bool, str]:
14
+ """Checks if the model card and license exist and have been filled"""
15
+ try:
16
+ card = ModelCard.load(repo_id)
17
+ except huggingface_hub.utils.EntryNotFoundError:
18
+ return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
19
+
20
+ # Enforce license metadata
21
+ if card.data.license is None:
22
+ if not ("license_name" in card.data and "license_link" in card.data):
23
+ return False, (
24
+ "License not found. Please add a license to your model card using the `license` metadata or a"
25
+ " `license_name`/`license_link` pair."
26
+ )
27
+
28
+ # Enforce card content
29
+ if len(card.text) < 200:
30
+ return False, "Please add a description to your model card, it is too short."
31
+
32
+ return True, ""
33
+
34
+ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
35
+ """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
36
+ try:
37
+ config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
+ if test_tokenizer:
39
+ try:
40
+ tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
+ except ValueError as e:
42
+ return (
43
+ False,
44
+ f"uses a tokenizer which is not in a transformers release: {e}",
45
+ None
46
+ )
47
+ except Exception as e:
48
+ return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
49
+ return True, None, config
50
+
51
+ except ValueError:
52
+ return (
53
+ False,
54
+ "needs to be launched with `trust_remote_code=True`. For safety reasons, we do not allow these models to be automatically submitted to the leaderboard.",
55
+ None
56
+ )
57
+
58
+ except Exception as e:
59
+ return False, "was not found on hub!", None
60
+
61
+
62
+ def get_model_size(model_info: ModelInfo, precision: str):
63
+ """Gets the model size, in billions of parameters, from the safetensors metadata (scaled up for GPTQ models); unknown sizes are reported as 0."""
64
+ try:
65
+ model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
+ except (AttributeError, TypeError):
67
+ return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
68
+
69
+ size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
70
+ model_size = size_factor * model_size
71
+ return model_size
72
+
73
+ def get_model_arch(model_info: ModelInfo):
74
+ """Gets the model architecture from the configuration"""
75
+ return model_info.config.get("architectures", "Unknown")
76
+
77
+ def already_submitted_models(requested_models_dir: str) -> set[str]:
78
+ """Gather a list of already submitted models to avoid duplicates"""
79
+ depth = 1
80
+ file_names = []
81
+ users_to_submission_dates = defaultdict(list)
82
+
83
+ for root, _, files in os.walk(requested_models_dir):
84
+ current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
85
+ if current_depth == depth:
86
+ for file in files:
87
+ if not file.endswith(".json"):
88
+ continue
89
+ with open(os.path.join(root, file), "r") as f:
90
+ info = json.load(f)
91
+ file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
92
+
93
+ # Select organisation
94
+ if info["model"].count("/") == 0 or "submitted_time" not in info:
95
+ continue
96
+ organisation, _ = info["model"].split("/")
97
+ users_to_submission_dates[organisation].append(info["submitted_time"])
98
+
99
+ return set(file_names), users_to_submission_dates
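
The validity helpers are self-contained and can be exercised outside the submission flow; a minimal sketch, assuming an arbitrary public model id (the id below is a placeholder, not from this diff):

    # Illustrative sketch only: calls the checks added in src/submission/check_validity.py.
    from src.submission.check_validity import check_model_card, is_model_on_hub

    model_id = "some-org/some-model"  # placeholder id, replace with a real Hub repo
    card_ok, card_msg = check_model_card(model_id)
    on_hub, hub_msg, config = is_model_on_hub(model_id, revision="main", test_tokenizer=False)
    if not (card_ok and on_hub):
        print(card_msg or hub_msg)
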
src/submission/submit.py CHANGED
@@ -1,184 +1,119 @@
1
- """
2
- Handle submissions to the CodeReview Bench leaderboard.
3
- """
4
-
5
  import json
6
  import os
7
- import tempfile
8
- from datetime import datetime
9
- from typing import Dict, List, Tuple
10
-
11
- from huggingface_hub import HfApi
12
- from datasets import load_dataset
13
-
14
- from src.display.formatting import styled_error, styled_message
15
- from src.envs import RESULTS_DATASET_ID, TOKEN, REPO_ID
16
- from src.leaderboard.processor import process_jsonl_submission, add_entries_to_leaderboard
17
 
 
18
 
19
- def validate_submission(file_path: str) -> Tuple[bool, str]:
20
- """
21
- Validate a submission file.
22
- """
23
- try:
24
- entries, message = process_jsonl_submission(file_path)
25
- if not entries:
26
- return False, message
27
- return True, "Submission is valid"
28
- except Exception as e:
29
- return False, f"Error validating submission: {e}"
30
-
31
-
32
- def submit_entry_to_hub(entry: Dict, model_name: str, mode: str, version="v0") -> Tuple[bool, str]:
33
- """
34
- Submit a model's evaluation entry to the HuggingFace dataset. The entry is uniquely identified by model_name, mode, and version.
35
- """
36
- try:
37
- # Create safe model name for file path
38
- model_name_safe = model_name.replace("/", "_").replace(" ", "_")
39
- mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
40
-
41
- # Create entry path in entries folder
42
- entry_path = f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json"
43
-
44
- # Save entry to temporary file
45
- with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
46
- json.dump(entry, temp_file, indent=2)
47
- temp_path = temp_file.name
48
-
49
- # Upload file
50
- api = HfApi(token=TOKEN)
51
- api.upload_file(
52
- path_or_fileobj=temp_path,
53
- path_in_repo=entry_path,
54
- repo_id=RESULTS_DATASET_ID,
55
- repo_type="dataset",
56
- commit_message=f"Add evaluation entry for {model_name} (mode {mode}, version {version})"
57
- )
58
-
59
- os.unlink(temp_path)
60
- return True, f"Successfully uploaded evaluation entry for {model_name} (mode {mode})"
61
- except Exception as e:
62
- return False, f"Error submitting entry to dataset: {e}"
63
-
64
-
65
- def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool, str]:
66
- """
67
- Submit updated leaderboard to the HuggingFace dataset.
68
- """
69
- try:
70
- # Create leaderboard data
71
- leaderboard_data = {
72
- "entries": entries,
73
- "last_updated": datetime.now().isoformat(),
74
- "version": version
75
- }
76
-
77
- # Save to temporary file
78
- with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
79
- json.dump(leaderboard_data, temp_file, indent=2)
80
- temp_path = temp_file.name
81
-
82
- # Upload file
83
- api = HfApi(token=TOKEN)
84
- api.upload_file(
85
- path_or_fileobj=temp_path,
86
- path_in_repo=f"leaderboards/leaderboard_{version}.json",
87
- repo_id=RESULTS_DATASET_ID,
88
- repo_type="dataset",
89
- commit_message=f"Update leaderboard for version {version}"
90
- )
91
-
92
- os.unlink(temp_path)
93
- return True, "Leaderboard updated successfully"
94
- except Exception as e:
95
- return False, f"Error updating leaderboard: {e}"
96
-
97
-
98
- def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
99
- """
100
- Process a submission to the CodeReview Bench leaderboard.
101
- """
102
  try:
103
- # Validate submission
104
- is_valid, validation_message = validate_submission(file_path)
105
- if not is_valid:
106
- return styled_error(validation_message)
107
-
108
- # Process the submission entries
109
- entries, message = process_jsonl_submission(file_path)
110
- if not entries:
111
- return styled_error(f"Failed to process submission: {message}")
112
-
113
- # Upload raw submission file
114
- model_name = metadata.get("model_name", "unknown")
115
- model_name_safe = model_name.replace("/", "_").replace(" ", "_")
116
-
117
- api = HfApi(token=TOKEN)
118
- submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
119
- api.upload_file(
120
- path_or_fileobj=file_path,
121
- path_in_repo=submission_path,
122
- repo_id=RESULTS_DATASET_ID,
123
- repo_type="dataset",
124
- commit_message=f"Add raw submission for {model_name}"
125
- )
126
-
127
- # Process entries and add metadata
128
- processed_entries = []
129
- for entry in entries:
130
- # Add metadata to entry
131
- entry.update({
132
- "model_name": metadata.get("model_name"),
133
- "model_type": metadata.get("model_type"),
134
- "review_model_type": str(metadata.get("review_model_type", "custom")).lower(),
135
- "mode": metadata.get("mode"),
136
- "base_model": metadata.get("base_model"),
137
- "revision": metadata.get("revision"),
138
- "precision": metadata.get("precision"),
139
- "weight_type": metadata.get("weight_type"),
140
- "version": version,
141
- "submission_date": datetime.now().isoformat()
142
- })
143
- processed_entries.append(entry)
144
-
145
- # Submit entries to entries folder
146
- for entry in processed_entries:
147
- success, message = submit_entry_to_hub(entry, model_name, metadata.get("mode"), version)
148
- if not success:
149
- return styled_error(message)
150
-
151
- # Get all entries from HF dataset and update leaderboard
152
- files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
153
- entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
154
-
155
- all_entries = []
156
- for entry_file in entry_files:
157
- try:
158
- entry_path = api.hf_hub_download(
159
- repo_id=RESULTS_DATASET_ID,
160
- filename=entry_file,
161
- repo_type="dataset",
162
- )
163
- with open(entry_path, 'r') as f:
164
- entry_data = json.load(f)
165
- all_entries.append(entry_data)
166
- except Exception as e:
167
- print(f"Error loading entry {entry_file}: {e}")
168
-
169
- # Update leaderboard with all entries
170
- success, message = submit_leaderboard_to_hub(all_entries, version)
171
- if not success:
172
- return styled_error(message)
173
-
174
- return styled_message("Submission successful! Model evaluated and leaderboard updated.")
175
-
176
- except Exception as e:
177
- return styled_error(f"Error processing submission: {e}")
178
- finally:
179
- # Clean up temporary files if they exist
180
- try:
181
- if os.path.exists(file_path):
182
- os.remove(file_path)
183
- except:
184
- pass
 
1
  import json
2
  import os
3
+ from datetime import datetime, timezone
4
+
5
+ from src.display.formatting import styled_error, styled_message, styled_warning
6
+ from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
+ from src.submission.check_validity import (
8
+ already_submitted_models,
9
+ check_model_card,
10
+ get_model_size,
11
+ is_model_on_hub,
12
+ )
13
+
14
+ REQUESTED_MODELS = None
15
+ USERS_TO_SUBMISSION_DATES = None
16
+
17
+ def add_new_eval(
18
+ model: str,
19
+ base_model: str,
20
+ revision: str,
21
+ precision: str,
22
+ weight_type: str,
23
+ model_type: str,
24
+ ):
25
+ global REQUESTED_MODELS
26
+ global USERS_TO_SUBMISSION_DATES
27
+ if not REQUESTED_MODELS:
28
+ REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
+
30
+ user_name = ""
31
+ model_path = model
32
+ if "/" in model:
33
+ user_name = model.split("/")[0]
34
+ model_path = model.split("/")[1]
35
+
36
+ precision = precision.split(" ")[0]
37
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
+
39
+ if model_type is None or model_type == "":
40
+ return styled_error("Please select a model type.")
41
+
42
+ # Does the model actually exist?
43
+ if revision == "":
44
+ revision = "main"
45
+
46
+ # Is the model on the hub?
47
+ if weight_type in ["Delta", "Adapter"]:
48
+ base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
+ if not base_model_on_hub:
50
+ return styled_error(f'Base model "{base_model}" {error}')
51
+
52
+ if not weight_type == "Adapter":
53
+ model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
+ if not model_on_hub:
55
+ return styled_error(f'Model "{model}" {error}')
56
+
57
+ # Is the model info correctly filled?
58
+ try:
59
+ model_info = API.model_info(repo_id=model, revision=revision)
60
+ except Exception:
61
+ return styled_error("Could not get your model information. Please fill it up properly.")
62
 
63
+ model_size = get_model_size(model_info=model_info, precision=precision)
64
 
65
+ # Were the model card and license filled?
 
66
  try:
67
+ license = model_info.cardData["license"]
68
+ except Exception:
69
+ return styled_error("Please select a license for your model")
70
+
71
+ modelcard_OK, error_msg = check_model_card(model)
72
+ if not modelcard_OK:
73
+ return styled_error(error_msg)
74
+
75
+ # Seems good, creating the eval
76
+ print("Adding new eval")
77
+
78
+ eval_entry = {
79
+ "model": model,
80
+ "base_model": base_model,
81
+ "revision": revision,
82
+ "precision": precision,
83
+ "weight_type": weight_type,
84
+ "status": "PENDING",
85
+ "submitted_time": current_time,
86
+ "model_type": model_type,
87
+ "likes": model_info.likes,
88
+ "params": model_size,
89
+ "license": license,
90
+ "private": False,
91
+ }
92
+
93
+ # Check for duplicate submission
94
+ if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
+ return styled_warning("This model has already been submitted.")
96
+
97
+ print("Creating eval file")
98
+ OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
+ os.makedirs(OUT_DIR, exist_ok=True)
100
+ out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
+
102
+ with open(out_path, "w") as f:
103
+ f.write(json.dumps(eval_entry))
104
+
105
+ print("Uploading eval file")
106
+ API.upload_file(
107
+ path_or_fileobj=out_path,
108
+ path_in_repo=out_path.split("eval-queue/")[1],
109
+ repo_id=QUEUE_REPO,
110
+ repo_type="dataset",
111
+ commit_message=f"Add {model} to eval queue",
112
+ )
113
+
114
+ # Remove the local file
115
+ os.remove(out_path)
116
+
117
+ return styled_message(
118
+ "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
+ )