File size: 8,962 Bytes
1613f96
 
 
cc7bfac
1613f96
 
 
 
 
 
 
 
 
 
 
a8bf9ff
 
 
 
 
 
e9469f3
 
a8bf9ff
1613f96
 
 
 
 
16e1e6b
 
1613f96
 
 
16e1e6b
1613f96
 
 
9b88843
1613f96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22ec7d4
811fa97
1613f96
197af1c
c430a49
811fa97
 
 
c430a49
197af1c
811fa97
 
 
 
197af1c
c430a49
 
811fa97
 
c430a49
197af1c
c430a49
 
811fa97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69139f5
811fa97
197af1c
811fa97
 
 
 
 
 
 
 
 
22ec7d4
1613f96
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    """Describe one evaluation task and how it is labelled on the leaderboard."""

    benchmark: str  # task key in the results JSON file
    metric: str  # metric key in the results JSON file
    col_name: str  # name to display in the leaderboard


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    """Evaluation tasks reported on the leaderboard.

    Each member's value is a ``Task`` built from, in order: the task key in
    the results JSON file, the metric key in that file, and the name to
    display in the leaderboard.
    """

    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("hf", "score", "LLM JUDGE")
    task1 = Task("rouge1", "score", "ROUGE-unigrams")
    task2 = Task("rouge2", "score", "ROUGE-bigrams")
    task3 = Task("rougeL", "score", "ROUGE-Longest Common Subsequence")
    task4 = Task("rougeLsum", "score", "ROUGE-Lsum")
    task5 = Task("bleu", "score", "Bleu")
    task6 = Task("brevity_penalty", "score", "Brevity Penalty")
    # NOTE(review): this value equals task0's value, so Enum makes task7 an
    # alias of task0 rather than a distinct member (it is skipped when
    # iterating over Tasks). Confirm whether another benchmark key was meant.
    task7 = Task("hf", "score", "LLM JUDGE")
    # Disabled entry kept for reference:
    # task = Task("precisions", "score", "Precision")

NUM_FEWSHOT = 0  # Few-shot count; change it to match your own evaluation setup
# ---------------------------------------------------

# Your leaderboard name
# HTML header: a horizontally centered logo image followed by an
# <h1 id="space-title"> element that currently contains no text.
TITLE = """<img src="https://iq.wiki/branding/downloadassets/logooriginalbrain.svg" height="40"  style="display: block; margin-left: auto; margin-right: auto;">
<h1 align="center" id="space-title"></h1>"""

# What does your leaderboard evaluate?
# One-line, centered HTML tagline naming the benchmark (Solbench).
INTRODUCTION_TEXT = """
<center>Solbench: Evaluating Solidity Code Generation</center>
"""

# Which evaluations are you running? How can people reproduce what you have?
# Markdown text with a "How it works" and a "Reproducibility" section.
# NOTE(review): both sections are still placeholders -- no explanation and no
# commands are listed yet.
LLM_BENCHMARKS_TEXT = """
## How it works

## Reproducibility
To reproduce our results, here are the commands you can run:

"""

# Markdown checklist for submitters: things to verify before submitting a
# model, and what to do when a submission ends up in the `FAILED` category.
# NOTE(review): the last paragraph refers to "the above command", but no
# command appears in this text -- confirm what it should point to.
EVALUATION_QUEUE_TEXT = """
## Some good practices before submitting a model

### 1) Make sure you can load your model and tokenizer using AutoClasses:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.

Note: make sure your model is public!
Note: if your model needs `trust_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!

### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!

### 3) Make sure your model has an open license!
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗

### 4) Fill up your model card
When we add extra information about models to the leaderboard, it will be automatically taken from the model card

## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped.
Make sure you have followed the above steps first.
If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
"""
# Markdown walkthrough for evaluating a model: install the libraries, set the
# API credentials, pick a model, run the generation + LLM-judge script, then
# upload the resulting JSON file to the results dataset.
#
# Fixes applied to the embedded script:
#   * `get_token` is imported from the public `huggingface_hub` namespace
#     instead of the private `huggingface_hub.utils._token` module.
#   * streamed chunks whose `delta.content` is None no longer raise a
#     TypeError (which the surrounding bare `except` silently swallowed,
#     cutting the judge's reply short).
#   * the output file name uses the last path component of MODEL_ID, so ids
#     without an "org/" prefix no longer raise IndexError.
#
# NOTE(review): the script calls `evaluate_sample(sample)` but never defines
# it (`LLM_JUDGE` is defined yet never called); presumably the linked Colab
# notebook contains the missing helper -- confirm and add it here.
# NOTE(review): the Anthropic judge prompt contains a dangling "We" line; it
# is left untouched so the documented prompt is not altered.
EVALUATION_SCRIPT = '''
To evaluate the model you can access the colab notebook at [this link](https://colab.research.google.com/drive/145KAGvgdAb8BrkObUrxAVWBd9EGDqy8N?usp=sharing).

## First install the necessary libraries 

```
pip install accelerate openai anthropic datasets
```

## Setup your : 
* OPENAI_API_KEY
* ANTHROPIC_API_KEY
* HF_TOKEN

## Select a model

```python
MODEL_ID = # model_id_here
```

## Then run the following script

````python
from transformers import pipeline
import torch
import os
import json
from openai import OpenAI
import anthropic
from huggingface_hub import InferenceClient, get_token
HF_TOKEN = get_token()

from datasets import load_dataset

ds = load_dataset("braindao/solbench-naive-judge-random-v1",split="test")


pipe = pipeline("text-generation", model= MODEL_ID , torch_dtype=torch.bfloat16, device_map="auto")

def generate(message):
  messages = [
      {"role": "user", "content": message},
  ]
  return pipe(messages,max_new_tokens=1024)[0]["generated_text"][1]["content"]

def convert_to_int(text):
  value = 0
  try :
    value = int(text)
  except :
    pass
  return value

def anthropic_judge(code,baseline):
  prompt = f"""Analyze the provided Solidity code and assign a score from 0 to 10 based on these criteria:

1. Functionality (0-2 points)
2. Security (0-2 points)
3. Efficiency (0-2 points)
4. Readability and Style (0-2 points)
5. Similarity with the Expert Code (0-2 points)

We
Evaluate the code thoroughly, sum up the points, and return ONLY an integer value representing the final score. Your entire response should consist of a single integer between 0 and 10, inclusive.

Solidity code to evaluate:
```solidity
{code}
```

Expert Code:
```solidity
{baseline}
```

OUTPUT FORMAT: [integer]"""


  sys = """You are a solidity code judge,
  You will only reply with an integer value between 0-10"""

  client = anthropic.Anthropic()

  message = client.messages.create(
      model="claude-3-5-sonnet-20240620",
      max_tokens=1000,
      temperature=0,
      system=sys,
      messages=[
          {
              "role": "user",
              "content": [
                  {
                      "type": "text",
                      "text": prompt
                  }
              ]
          }
      ]
  )
  return convert_to_int(message.content[0].text)


def openai_judge(code,baseline):
  prompt = f"""evaluate the following solidity code and return a score between 0 and 10 based how far the code achieves the following criteria:

1. Functionality (0-2 points)
2. Security (0-2 points)
3. Efficiency (0-2 points)
4. Readability and Style (0-2 points)
5. Similarity with the Expert Code (0-2 points)

code to evaluate:
{code}

expert code:
{baseline}

return only an integer value and no additional comment, score should be either 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 or 10.
"""
  client = OpenAI()
  completion = client.chat.completions.create(
      model="gpt-4o",
      messages=[
          {"role": "user", "content": prompt}
      ]
  )
  return convert_to_int(completion.choices[0].message.content)


def hf_judge(code,baseline):
  prompt = f"""evaluate the following solidity code and return a score between 0 and 10 based how far the code achieves the following criteria:

1. Functionality (0-2 points)
2. Security (0-2 points)
3. Efficiency (0-2 points)
4. Readability and Style (0-2 points)
5. Similarity with the Expert Code (0-2 points)

code to evaluate:
{code}

expert code:
{baseline}

return only an integer value and no additional comment, score should be either 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 or 10.
"""
  client = InferenceClient(
      "meta-llama/Meta-Llama-3.1-405B-Instruct",
      token=HF_TOKEN,
  )
  out = ""
  try :
    for message in client.chat_completion(
      messages=[{"role":"system","content" : "you are a solidity code judge, you will only reply with an integer value between 0-10"},
          {"role": "user", "content": prompt}],
      max_tokens=500,
      stream=True,
    ):
        out += message.choices[0].delta.content or ""
  except :
    pass
  return convert_to_int(out)

def LLM_JUDGE(code,baseline,judges=["openai","anthropic","hf"]) :
  out = {}
  if "openai" in judges :
    out["openai"] = openai_judge(code,baseline)
  if "anthropic" in judges :
    out["anthropic"] = anthropic_judge(code,baseline)
  if "hf" in judges :
    out["hf"] = hf_judge(code,baseline)
  return out

# Judge model against data
from tqdm import tqdm
scores = {"openai":[],"anthropic":[],"hf":[]}
for sample in tqdm(ds) :
  score =  evaluate_sample(sample)
  for key in score.keys():
    scores[key].append(score[key])

# normalize scores
for key in scores.keys():
  scores[key] = sum(scores[key])/(10*len(scores[key]))


d = {
    "config": {
        "model_dtype": "torch.bfloat16",
        "model_name": MODEL_ID,
        "model_sha": "main"
    },
    "results": {
        "openai": {
            "score": 0
        },
        "anthropic": {
            "score": 0
        },
        "hf": {
            "score": 0
        }
    }
}

for key in scores.keys() :
  d["results"][key]["score"] = scores[key]


# Serializing json
json_object = json.dumps(d, indent=4)

# Writing to sample.json
file_name = MODEL_ID.split("/")[-1] + ".json"
with open(file_name, "w") as outfile:
    outfile.write(json_object)

````

## if you are not part of braindao set `create_pr` to **True**
```python
from huggingface_hub import upload_file
upload_file(path_or_fileobj = file_name,
            path_in_repo=f"{MODEL_ID}.json",
            repo_id="braindao/results",
            repo_type="dataset",
            create_pr=False)
```

'''
# Caption for the citation snippet.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# Citation snippet itself; currently just a newline, i.e. no citation entry
# has been filled in yet.
CITATION_BUTTON_TEXT = r"""
"""