Refactor and improve space and leaderboard initialization
- Added `ignore/` to `.gitignore`
- Refactored space initialization in `app.py`
- Updated labels and filters in the `init_leaderboard` function in `app.py`
- Removed the evaluation script from `src/about.py`
- Updated column labels in `src/display/utils.py`
- Changed the queue and results repositories in `src/envs.py`

Files changed:

- `.gitignore` +1 -0
- `app.py` +25 -16
- `src/about.py` +2 -234
- `src/display/utils.py` +3 -3
- `src/envs.py` +2 -2
`.gitignore` CHANGED

```diff
@@ -1,3 +1,4 @@
+ignore/
 auto_evals/
 venv/
 __pycache__/
```
`app.py` CHANGED

```diff
@@ -36,24 +36,37 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
-### Space initialisation
+# Space initialisation
 try:
-    print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 try:
-    print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 
 
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF = get_leaderboard_df(
+    EVAL_RESULTS_PATH,
+    EVAL_REQUESTS_PATH,
+    COLS,
+    BENCHMARK_COLS,
+)
 
 (
     finished_eval_queue_df,
@@ -70,25 +83,21 @@ def init_leaderboard(dataframe):
         select_columns=SelectColumns(
             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
+            label="Columns",
         ),
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Floating-point format"),
             ColumnFilter(
                 AutoEvalColumn.params.name,
                 type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=False
+                min=1,
+                max=500,
+                step=1,
+                label="Number of parameters (billions)",
             ),
         ],
-        bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
 
```
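After this change, the queue and results snapshots are fetched with identical keyword arguments and both fall back to `restart_space()` on any exception. The two `try`/`except` blocks differ only in the repository and target directory, so they could be folded into one helper; the sketch below is an illustrative alternative, not part of this commit. It assumes `QUEUE_REPO`, `RESULTS_REPO`, `EVAL_REQUESTS_PATH`, `EVAL_RESULTS_PATH`, and `TOKEN` are exposed by `src.envs` as in the stock leaderboard template, and that `restart_space` is defined earlier in `app.py`.

```python
from huggingface_hub import snapshot_download

# Assumed to exist in src/envs.py, as in the stock leaderboard template.
from src.envs import EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, RESULTS_REPO, TOKEN


def download_or_restart(repo_id: str, local_dir: str) -> None:
    """Mirror one leaderboard dataset locally; restart the Space if the download fails."""
    try:
        snapshot_download(
            repo_id=repo_id,
            local_dir=local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception:
        restart_space()  # defined earlier in app.py


# Equivalent to the two try/except blocks added in this commit:
download_or_restart(QUEUE_REPO, EVAL_REQUESTS_PATH)
download_or_restart(RESULTS_REPO, EVAL_RESULTS_PATH)
```

Keeping the fallback inside the helper preserves the commit's behaviour of restarting the Space whenever either download fails.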
`src/about.py` CHANGED

`````diff
@@ -76,238 +76,6 @@ If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
 If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
-EVALUATION_SCRIPT = '''
-To evaluate the model you can access the colab notebook at [this link](https://colab.research.google.com/drive/145KAGvgdAb8BrkObUrxAVWBd9EGDqy8N?usp=sharing).
-
-## First install the necessary libraries
-
-```
-pip install accelerate openai anthropic datasets
-```
-
-## Setup your :
-* OPENAI_API_KEY
-* ANTHROPIC_API_KEY
-* HF_TOKEN
-
-## Select a model
-
-```python
-MODEL_ID = # model_id_here
-```
-
-## Then run the following script
-
-````python
-from transformers import pipeline
-import torch
-import os
-import json
-from openai import OpenAI
-import anthropic
-from huggingface_hub.utils._token import get_token
-from huggingface_hub import InferenceClient
-HF_TOKEN = get_token()
-
-from datasets import load_dataset
-
-ds = load_dataset("braindao/solbench-naive-judge-random-v1",split="test")
-
-
-pipe = pipeline("text-generation", model= MODEL_ID , torch_dtype=torch.bfloat16, device_map="auto")
-
-def generate(message):
-    messages = [
-        {"role": "user", "content": message},
-    ]
-    return pipe(messages,max_new_tokens=1024)[0]["generated_text"][1]["content"]
-
-def convert_to_int(text):
-    value = 0
-    try :
-        value = int(text)
-    except :
-        pass
-    return value
-
-def anthropic_judge(code,baseline):
-    prompt = f"""Analyze the provided Solidity code and assign a score from 0 to 10 based on these criteria:
-
-1. Functionality (0-2 points)
-2. Security (0-2 points)
-3. Efficiency (0-2 points)
-4. Readability and Style (0-2 points)
-5. Similarity with the Expert Code (0-2 points)
-
-We
-Evaluate the code thoroughly, sum up the points, and return ONLY an integer value representing the final score. Your entire response should consist of a single integer between 0 and 10, inclusive.
-
-Solidity code to evaluate:
-```solidity
-{code}
-```
-
-Expert Code:
-```solidity
-{baseline}
-```
-
-OUTPUT FORMAT: [integer]"""
-
-
-    sys = """You are a solidity code judge,
-    You will only reply with an integer value between 0-10"""
-
-    client = anthropic.Anthropic()
-
-    message = client.messages.create(
-        model="claude-3-5-sonnet-20240620",
-        max_tokens=1000,
-        temperature=0,
-        system=sys,
-        messages=[
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": prompt
-                    }
-                ]
-            }
-        ]
-    )
-    return convert_to_int(message.content[0].text)
-
-
-def openai_judge(code,baseline):
-    prompt = f"""evaluate the following solidity code and return a score between 0 and 10 based how far the code achieves the following criteria:
-
-1. Functionality (0-2 points)
-2. Security (0-2 points)
-3. Efficiency (0-2 points)
-4. Readability and Style (0-2 points)
-5. Similarity with the Expert Code (0-2 points)
-
-code to evaluate:
-{code}
-
-expert code:
-{baseline}
-
-return only an integer value and no additional comment, score should be either 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 or 10.
-"""
-    client = OpenAI()
-    completion = client.chat.completions.create(
-        model="gpt-4o",
-        messages=[
-            {"role": "user", "content": prompt}
-        ]
-    )
-    return convert_to_int(completion.choices[0].message.content)
-
-
-def hf_judge(code,baseline):
-    prompt = f"""evaluate the following solidity code and return a score between 0 and 10 based how far the code achieves the following criteria:
-
-1. Functionality (0-2 points)
-2. Security (0-2 points)
-3. Efficiency (0-2 points)
-4. Readability and Style (0-2 points)
-5. Similarity with the Expert Code (0-2 points)
-
-code to evaluate:
-{code}
-
-expert code:
-{baseline}
-
-return only an integer value and no additional comment, score should be either 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 or 10.
-"""
-    client = InferenceClient(
-        "meta-llama/Meta-Llama-3.1-405B-Instruct",
-        token=HF_TOKEN,
-    )
-    out = ""
-    try :
-        for message in client.chat_completion(
-            messages=[{"role":"system","content" : "you are a solidity code judge, you will only reply with an integer value between 0-10"},
-                      {"role": "user", "content": prompt}],
-            max_tokens=500,
-            stream=True,
-        ):
-            out += message.choices[0].delta.content
-    except :
-        pass
-    return convert_to_int(out)
-
-def LLM_JUDGE(code,baseline,judges=["openai","anthropic","hf"]) :
-    out = {}
-    if "openai" in judges :
-        out["openai"] = openai_judge(code,baseline)
-    if "anthropic" in judges :
-        out["anthropic"] = anthropic_judge(code,baseline)
-    if "hf" in judges :
-        out["hf"] = hf_judge(code,baseline)
-    return out
-
-# Judge model against data
-from tqdm import tqdm
-scores = {"openai":[],"anthropic":[],"hf":[]}
-for sample in tqdm(ds) :
-    score = evaluate_sample(sample)
-    for key in score.keys():
-        scores[key].append(score[key])
-
-# normalize scores
-for key in scores.keys():
-    scores[key] = sum(scores[key])/(10*len(scores[key]))
-
-
-d = {
-    "config": {
-        "model_dtype": "torch.bfloat16",
-        "model_name": MODEL_ID,
-        "model_sha": "main"
-    },
-    "results": {
-        "openai": {
-            "score": 0
-        },
-        "anthropic": {
-            "score": 0
-        },
-        "hf": {
-            "score": 0
-        }
-    }
-}
-
-for key in scores.keys() :
-    d["results"][key]["score"] = scores[key]
-
-
-# Serializing json
-json_object = json.dumps(d, indent=4)
-
-# Writing to sample.json
-file_name = MODEL_ID.split("/")[1] + ".json"
-with open(file_name, "w") as outfile:
-    outfile.write(json_object)
-
-````
-
-## if you are not part of braindao set `create_pr` to **True**
-```python
-from huggingface_hub import upload_file
-upload_file(path_or_fileobj = file_name,
-            path_in_repo=f"{MODEL_ID}.json",
-            repo_id="braindao/results",
-            repo_type="dataset",
-            create_pr=False)
-```
-
-'''
+EVALUATION_SCRIPT = ''
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
-"""
+CITATION_BUTTON_TEXT = ''
`````
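The deleted `EVALUATION_SCRIPT` documented how per-model results were produced: three LLM judges score each sample from 0 to 10, the per-judge scores are normalised, and a JSON file is written and uploaded to the results dataset. For reference, this is the shape of that result file, reconstructed from the deleted lines; the model id and scores below are placeholders.

```python
import json

# Result-file layout used by the removed evaluation script (reconstructed from the
# deleted lines of src/about.py); values here are placeholders.
result = {
    "config": {
        "model_dtype": "torch.bfloat16",
        "model_name": "braindao/example-model",  # hypothetical model id
        "model_sha": "main",
    },
    "results": {
        "openai": {"score": 0.0},      # per-judge mean score, normalised to [0, 1]
        "anthropic": {"score": 0.0},
        "hf": {"score": 0.0},
    },
}

# The script wrote one such file per model and uploaded it to the results dataset.
with open("example-model.json", "w") as outfile:
    outfile.write(json.dumps(result, indent=4))
```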
`src/display/utils.py` CHANGED

```diff
@@ -42,9 +42,9 @@ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type",
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("Parameters ⚙️", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Likes ❤️", "number", False)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
```
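These `ColumnContent` labels are the strings the leaderboard UI displays. In the stock demo-leaderboard template, `auto_eval_column_dict` is materialised into a frozen `AutoEvalColumn` dataclass, which is why `AutoEvalColumn.params.name` in the `app.py` slider filter now resolves to "Parameters ⚙️". A minimal, self-contained sketch of that pattern follows; the `ColumnContent` definition here is simplified and assumed, and only the two renamed entries are shown.

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    # Simplified/assumed field layout: display label, dtype, shown by default?
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


auto_eval_column_dict = [
    ["params", ColumnContent, ColumnContent("Parameters ⚙️", "number", False)],
    ["likes", ColumnContent, ColumnContent("Likes ❤️", "number", False)],
]

# Each [attribute, type, default] triple becomes a field of the generated dataclass.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.params.name)  # -> "Parameters ⚙️"
print(AutoEvalColumn.likes.name)   # -> "Likes ❤️"
```

Because `init_leaderboard` refers to columns via `AutoEvalColumn.<attr>.name`, renaming a label here propagates to the UI without touching `app.py`.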
`src/envs.py` CHANGED

```diff
@@ -14,8 +14,8 @@ OWNER = "braindao" # Change to your org - don't forget to create a results and
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/solidity-leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+QUEUE_REPO = f"{OWNER}/solbench-leaderboard-queue"
+RESULTS_REPO = f"{OWNER}/solbench-leaderboard-results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH = os.getenv("HF_HOME", ".")
```
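The renamed queue and results datasets have to exist under the `braindao` org before the `snapshot_download` calls in `app.py` can succeed; the comment on `OWNER` already warns about this. Below is a minimal sketch of creating them with `huggingface_hub`, as a one-off setup step that is not part of this commit; the token placeholder and privacy settings are assumptions.

```python
from huggingface_hub import HfApi

OWNER = "braindao"
QUEUE_REPO = f"{OWNER}/solbench-leaderboard-queue"
RESULTS_REPO = f"{OWNER}/solbench-leaderboard-results"

api = HfApi(token="hf_...")  # replace with a write token for the org

for repo_id in (QUEUE_REPO, RESULTS_REPO):
    # exist_ok=True makes this idempotent if the dataset was already created.
    api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
```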