brunneis committed
Commit e9f2fe4
Parent: be8b8f3

Refactor and improve space and leaderboard initialization

- Added `ignore/` to `.gitignore`.
- Refactored Space initialization in `app.py`.
- Updated labels and filters in the `init_leaderboard` function.
- Removed the evaluation script from `src/about.py`.
- Updated column labels in `src/display/utils.py`.
- Pointed the queue and results repositories in `src/envs.py` at the SolBench leaderboard datasets.

Files changed (5):
  1. .gitignore +1 -0
  2. app.py +25 -16
  3. src/about.py +2 -234
  4. src/display/utils.py +3 -3
  5. src/envs.py +2 -2
.gitignore CHANGED
@@ -1,3 +1,4 @@
+ignore/
 auto_evals/
 venv/
 __pycache__/

app.py CHANGED
@@ -36,24 +36,37 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
-### Space initialisation
+# Space initialisation
 try:
-    print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 try:
-    print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 
 
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF = get_leaderboard_df(
+    EVAL_RESULTS_PATH,
+    EVAL_REQUESTS_PATH,
+    COLS,
+    BENCHMARK_COLS,
+)
 
 (
     finished_eval_queue_df,
@@ -70,25 +83,21 @@ def init_leaderboard(dataframe):
         select_columns=SelectColumns(
             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
-            label="Select Columns to Display:",
+            label="Columns",
         ),
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Floating-point format"),
             ColumnFilter(
                 AutoEvalColumn.params.name,
                 type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=False
+                min=1,
+                max=500,
+                step=1,
+                label="Number of parameters (billions)",
             ),
         ],
-        bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
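
The refactor keeps the template's startup behaviour: both dataset repos are snapshotted locally, and the Space restarts itself if either download fails. A minimal standalone sketch of that pattern, assuming the same `huggingface_hub` calls and an `HF_TOKEN` Space secret (the `download_or_restart` helper is a hypothetical convenience; the actual `app.py` repeats the try/except inline):

```python
# Sketch only: TOKEN, repo names and local paths stand in for the constants defined in src/envs.py.
import os

from huggingface_hub import HfApi, snapshot_download

TOKEN = os.environ.get("HF_TOKEN")  # assumed to come from the Space secrets
API = HfApi(token=TOKEN)
REPO_ID = "braindao/solidity-leaderboard"


def restart_space():
    # Restarting the Space retries the whole initialisation on the next boot.
    API.restart_space(repo_id=REPO_ID)


def download_or_restart(repo_id: str, local_dir: str) -> None:
    """Hypothetical helper: mirror a dataset repo locally, restart the Space on failure."""
    try:
        snapshot_download(
            repo_id=repo_id,
            local_dir=local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception:
        restart_space()


download_or_restart("braindao/solbench-leaderboard-queue", "./eval-queue")
download_or_restart("braindao/solbench-leaderboard-results", "./eval-results")
```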
 
src/about.py CHANGED
@@ -76,238 +76,6 @@ If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
 If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
-EVALUATION_SCRIPT = '''
-To evaluate the model you can access the colab notebook at [this link](https://colab.research.google.com/drive/145KAGvgdAb8BrkObUrxAVWBd9EGDqy8N?usp=sharing).
-
-## First install the necessary libraries
-
-```
-pip install accelerate openai anthropic datasets
-```
-
-## Setup your :
-* OPENAI_API_KEY
-* ANTHROPIC_API_KEY
-* HF_TOKEN
-
-## Select a model
-
-```python
-MODEL_ID = # model_id_here
-```
-
-## Then run the following script
-
-````python
-from transformers import pipeline
-import torch
-import os
-import json
-from openai import OpenAI
-import anthropic
-from huggingface_hub.utils._token import get_token
-from huggingface_hub import InferenceClient
-HF_TOKEN = get_token()
-
-from datasets import load_dataset
-
-ds = load_dataset("braindao/solbench-naive-judge-random-v1",split="test")
-
-
-pipe = pipeline("text-generation", model= MODEL_ID , torch_dtype=torch.bfloat16, device_map="auto")
-
-def generate(message):
-    messages = [
-        {"role": "user", "content": message},
-    ]
-    return pipe(messages,max_new_tokens=1024)[0]["generated_text"][1]["content"]
-
-def convert_to_int(text):
-    value = 0
-    try :
-        value = int(text)
-    except :
-        pass
-    return value
-
-def anthropic_judge(code,baseline):
-    prompt = f"""Analyze the provided Solidity code and assign a score from 0 to 10 based on these criteria:
-
-1. Functionality (0-2 points)
-2. Security (0-2 points)
-3. Efficiency (0-2 points)
-4. Readability and Style (0-2 points)
-5. Similarity with the Expert Code (0-2 points)
-
-We
-Evaluate the code thoroughly, sum up the points, and return ONLY an integer value representing the final score. Your entire response should consist of a single integer between 0 and 10, inclusive.
-
-Solidity code to evaluate:
-```solidity
-{code}
-```
-
-Expert Code:
-```solidity
-{baseline}
-```
-
-OUTPUT FORMAT: [integer]"""
-
-
-    sys = """You are a solidity code judge,
-    You will only reply with an integer value between 0-10"""
-
-    client = anthropic.Anthropic()
-
-    message = client.messages.create(
-        model="claude-3-5-sonnet-20240620",
-        max_tokens=1000,
-        temperature=0,
-        system=sys,
-        messages=[
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": prompt
-                    }
-                ]
-            }
-        ]
-    )
-    return convert_to_int(message.content[0].text)
-
-
-def openai_judge(code,baseline):
-    prompt = f"""evaluate the following solidity code and return a score between 0 and 10 based how far the code achieves the following criteria:
-
-1. Functionality (0-2 points)
-2. Security (0-2 points)
-3. Efficiency (0-2 points)
-4. Readability and Style (0-2 points)
-5. Similarity with the Expert Code (0-2 points)
-
-code to evaluate:
-{code}
-
-expert code:
-{baseline}
-
-return only an integer value and no additional comment, score should be either 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 or 10.
-"""
-    client = OpenAI()
-    completion = client.chat.completions.create(
-        model="gpt-4o",
-        messages=[
-            {"role": "user", "content": prompt}
-        ]
-    )
-    return convert_to_int(completion.choices[0].message.content)
-
-
-def hf_judge(code,baseline):
-    prompt = f"""evaluate the following solidity code and return a score between 0 and 10 based how far the code achieves the following criteria:
-
-1. Functionality (0-2 points)
-2. Security (0-2 points)
-3. Efficiency (0-2 points)
-4. Readability and Style (0-2 points)
-5. Similarity with the Expert Code (0-2 points)
-
-code to evaluate:
-{code}
-
-expert code:
-{baseline}
-
-return only an integer value and no additional comment, score should be either 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 or 10.
-"""
-    client = InferenceClient(
-        "meta-llama/Meta-Llama-3.1-405B-Instruct",
-        token=HF_TOKEN,
-    )
-    out = ""
-    try :
-        for message in client.chat_completion(
-            messages=[{"role":"system","content" : "you are a solidity code judge, you will only reply with an integer value between 0-10"},
-                      {"role": "user", "content": prompt}],
-            max_tokens=500,
-            stream=True,
-        ):
-            out += message.choices[0].delta.content
-    except :
-        pass
-    return convert_to_int(out)
-
-def LLM_JUDGE(code,baseline,judges=["openai","anthropic","hf"]) :
-    out = {}
-    if "openai" in judges :
-        out["openai"] = openai_judge(code,baseline)
-    if "anthropic" in judges :
-        out["anthropic"] = anthropic_judge(code,baseline)
-    if "hf" in judges :
-        out["hf"] = hf_judge(code,baseline)
-    return out
-
-# Judge model against data
-from tqdm import tqdm
-scores = {"openai":[],"anthropic":[],"hf":[]}
-for sample in tqdm(ds) :
-    score = evaluate_sample(sample)
-    for key in score.keys():
-        scores[key].append(score[key])
-
-# normalize scores
-for key in scores.keys():
-    scores[key] = sum(scores[key])/(10*len(scores[key]))
-
-
-d = {
-    "config": {
-        "model_dtype": "torch.bfloat16",
-        "model_name": MODEL_ID,
-        "model_sha": "main"
-    },
-    "results": {
-        "openai": {
-            "score": 0
-        },
-        "anthropic": {
-            "score": 0
-        },
-        "hf": {
-            "score": 0
-        }
-    }
-}
-
-for key in scores.keys() :
-    d["results"][key]["score"] = scores[key]
-
-
-# Serializing json
-json_object = json.dumps(d, indent=4)
-
-# Writing to sample.json
-file_name = MODEL_ID.split("/")[1] + ".json"
-with open(file_name, "w") as outfile:
-    outfile.write(json_object)
-
-````
-
-## if you are not part of braindao set `create_pr` to **True**
-```python
-from huggingface_hub import upload_file
-upload_file(path_or_fileobj = file_name,
-            path_in_repo=f"{MODEL_ID}.json",
-            repo_id="braindao/results",
-            repo_type="dataset",
-            create_pr=False)
-```
-
-'''
+EVALUATION_SCRIPT = ''
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""
-"""
+CITATION_BUTTON_TEXT = ''
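
The removed script scored each sample with up to three LLM judges on a 0-10 scale and then normalized per-judge totals to a 0-1 value before serializing the results. A toy illustration of that normalization (the sample values are made up):

```python
# Per-judge scores on the 0-10 scale, as produced by the removed judging functions (toy values).
scores = {"openai": [7, 9, 8], "anthropic": [6, 8, 7], "hf": [9, 10, 8]}

# Divide each judge's summed scores by the maximum attainable total (10 points per sample).
normalized = {judge: sum(vals) / (10 * len(vals)) for judge, vals in scores.items()}
print(normalized)  # {'openai': 0.8, 'anthropic': 0.7, 'hf': 0.9}
```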
 
src/display/utils.py CHANGED
@@ -42,9 +42,9 @@ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type",
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("Parameters ⚙️", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Likes ❤️", "number", False)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
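
Only the display labels change here; each `["attribute", ColumnContent, ColumnContent(label, type, displayed_by_default, ...)]` entry is still consumed the same way. A rough sketch of how such entries typically become the `AutoEvalColumn` accessor in leaderboard templates of this kind (the `ColumnContent` field names are assumptions, not shown in this diff):

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str                   # label rendered in the table header
    type: str                   # column type ("str", "number", "bool", ...)
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


auto_eval_column_dict = [
    ["license", ColumnContent, ColumnContent("License", "str", False)],
    ["params", ColumnContent, ColumnContent("Parameters ⚙️", "number", False)],
    ["likes", ColumnContent, ColumnContent("Likes ❤️", "number", False)],
]

# Each [attribute, type, default] triple becomes a field of a frozen dataclass,
# so code elsewhere can address columns as AutoEvalColumn.<attribute>.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.params.name)  # -> "Parameters ⚙️"
```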
 
src/envs.py CHANGED
@@ -14,8 +14,8 @@ OWNER = "braindao" # Change to your org - don't forget to create a results and
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/solidity-leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+QUEUE_REPO = f"{OWNER}/solbench-leaderboard-queue"
+RESULTS_REPO = f"{OWNER}/solbench-leaderboard-results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH = os.getenv("HF_HOME", ".")
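
The rename only touches the two repository constants; the rest of `src/envs.py` presumably keeps the template layout, roughly as in this sketch (the `HF_TOKEN` secret and the derived local paths are assumptions based on the names referenced in `app.py`):

```python
import os

from huggingface_hub import HfApi

TOKEN = os.environ.get("HF_TOKEN")  # assumed to come from the Space secrets
OWNER = "braindao"

REPO_ID = f"{OWNER}/solidity-leaderboard"
QUEUE_REPO = f"{OWNER}/solbench-leaderboard-queue"
RESULTS_REPO = f"{OWNER}/solbench-leaderboard-results"

# Local mirrors of the queue and results datasets, rooted at HF_HOME if set.
CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")

API = HfApi(token=TOKEN)
```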