from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # key of the benchmark in the results json files
    metric: str     # metric reported for that benchmark
    col_name: str   # column name displayed on the leaderboard


class Tasks(Enum):
    # task_key = Task(benchmark key, metric key, display column name)
    task0 = Task("openai", "score", "OpenAI")
    task1 = Task("anthropic", "score", "Anthropic")
    task2 = Task("hf", "score", "HuggingFace")


NUM_FEWSHOT = 0


TITLE = """<img src="https://iq.wiki/branding/downloadassets/logobrainaltwhite.png" width="50" height="50" style="display: block; margin-left: auto; margin-right: auto;">
<h1 align="center" id="space-title">Demo leaderboard</h1>"""

INTRODUCTION_TEXT = """
This leaderboard evaluates AI models on Solidity code generation.
"""

LLM_BENCHMARKS_TEXT = """
## How it works
Each submitted model generates Solidity code for every sample of the `braindao/solbench-naive-judge-random-v1` benchmark. Every generation is scored from 0 to 10 against the expert reference code by three LLM judges (GPT-4o, Claude 3.5 Sonnet, and Llama 3.1 405B Instruct), and the per-judge scores are averaged over the benchmark and normalized to the [0, 1] range.

## Reproducibility
To reproduce our results, here are the commands you can run:
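
The evaluation is driven by the evaluation script that ships with this leaderboard (also available as a hosted Colab notebook); the only up-front installation it needs is its Python dependencies:
```
pip install accelerate openai anthropic datasets
```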
"""

EVALUATION_QUEUE_TEXT = """
## Some good practices before submitting a model

### 1) Make sure you can load your model and tokenizer using AutoClasses:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.

Note: make sure your model is public!
Note: if your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it, so stay posted!

### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
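
If your repository still contains PyTorch `.bin` weights, one simple way to convert them (a minimal sketch; "your model name" is a placeholder, and this re-uploads the weights to the same repo) is to reload the model and push it back with `safe_serialization` enabled:
```python
from transformers import AutoModel

model = AutoModel.from_pretrained("your model name")            # loads the existing .bin weights
model.push_to_hub("your model name", safe_serialization=True)   # re-uploads them as safetensors
```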

### 3) Make sure your model has an open license!
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗

### 4) Fill up your model card
When we add extra information about models to the leaderboard, it will be automatically taken from the model card.

## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped.
Make sure you have followed the above steps first.
If everything is done, check that you can run the evaluation script provided with this leaderboard on your model locally (you can start with a few samples to keep the run short).
"""

EVALUATION_SCRIPT = """
To evaluate a model you can use the Colab notebook at [this link](https://colab.research.google.com/drive/145KAGvgdAb8BrkObUrxAVWBd9EGDqy8N?usp=sharing).

First, install the necessary libraries:
```
pip install accelerate openai anthropic datasets
```
Set up the following environment variables (an example is shown right after this list):
* OPENAI_API_KEY
* ANTHROPIC_API_KEY
* HF_TOKEN
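
For example, in a notebook you can export them from Python before running the evaluation (the values below are placeholders for your own credentials):
```python
import os

# Placeholders: substitute your own keys/token.
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."
os.environ["HF_TOKEN"] = "hf_..."
```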

Select a model:
```python
MODEL_ID = "your-username/your-model-name"  # placeholder: the Hub id of the model to evaluate
```
Then run the following script:
```python
from transformers import pipeline
import torch
import os
import json
from openai import OpenAI
import anthropic
from huggingface_hub.utils._token import get_token
from huggingface_hub import InferenceClient
HF_TOKEN = get_token()

from datasets import load_dataset

ds = load_dataset("braindao/solbench-naive-judge-random-v1", split="test")


pipe = pipeline("text-generation", model=MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto")

def generate(message):
    """Generate a completion for a single user message and return the assistant reply."""
    messages = [
        {"role": "user", "content": message},
    ]
    return pipe(messages, max_new_tokens=1024)[0]["generated_text"][1]["content"]

def convert_to_int(text):
    """Parse the judge output into an integer score, defaulting to 0 if parsing fails."""
    value = 0
    try:
        value = int(text)
    except (ValueError, TypeError):
        pass
    return value


def anthropic_judge(code, baseline):
    """Score `code` against the expert `baseline` with Claude and return an integer 0-10."""
    prompt = f"""Analyze the provided Solidity code and assign a score from 0 to 10 based on these criteria:

1. Functionality (0-2 points)
2. Security (0-2 points)
3. Efficiency (0-2 points)
4. Readability and Style (0-2 points)
5. Similarity with the Expert Code (0-2 points)

Evaluate the code thoroughly, sum up the points, and return ONLY an integer value representing the final score. Your entire response should consist of a single integer between 0 and 10, inclusive.

Solidity code to evaluate:
```solidity
{code}
```

Expert Code:
```solidity
{baseline}
```

OUTPUT FORMAT: [integer]"""

    sys = """You are a solidity code judge,
You will only reply with an integer value between 0-10"""

    client = anthropic.Anthropic()

    message = client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=1000,
        temperature=0,
        system=sys,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    }
                ]
            }
        ]
    )
    return convert_to_int(message.content[0].text)


def openai_judge(code, baseline):
    """Score `code` against the expert `baseline` with GPT-4o and return an integer 0-10."""
    prompt = f"""evaluate the following solidity code and return a score between 0 and 10 based on how far the code achieves the following criteria:

1. Functionality (0-2 points)
2. Security (0-2 points)
3. Efficiency (0-2 points)
4. Readability and Style (0-2 points)
5. Similarity with the Expert Code (0-2 points)

code to evaluate:
{code}

expert code:
{baseline}

return only an integer value and no additional comment, score should be either 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 or 10.
"""
    client = OpenAI()
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return convert_to_int(completion.choices[0].message.content)


def hf_judge(code, baseline):
    """Score `code` against the expert `baseline` with Llama 3.1 405B Instruct and return an integer 0-10."""
    prompt = f"""evaluate the following solidity code and return a score between 0 and 10 based on how far the code achieves the following criteria:

1. Functionality (0-2 points)
2. Security (0-2 points)
3. Efficiency (0-2 points)
4. Readability and Style (0-2 points)
5. Similarity with the Expert Code (0-2 points)

code to evaluate:
{code}

expert code:
{baseline}

return only an integer value and no additional comment, score should be either 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 or 10.
"""
    client = InferenceClient(
        "meta-llama/Meta-Llama-3.1-405B-Instruct",
        token=HF_TOKEN,
    )
    out = ""
    try:
        # Stream the judge's reply and accumulate it; fall back to a score of 0 on any API error.
        for message in client.chat_completion(
            messages=[
                {"role": "system", "content": "you are a solidity code judge, you will only reply with an integer value between 0-10"},
                {"role": "user", "content": prompt},
            ],
            max_tokens=500,
            stream=True,
        ):
            out += message.choices[0].delta.content or ""
    except Exception:
        pass
    return convert_to_int(out)


def LLM_JUDGE(code, baseline, judges=("openai", "anthropic", "hf")):
    """Run the selected judges on one generation and return a dict of integer scores."""
    out = {}
    if "openai" in judges:
        out["openai"] = openai_judge(code, baseline)
    if "anthropic" in judges:
        out["anthropic"] = anthropic_judge(code, baseline)
    if "hf" in judges:
        out["hf"] = hf_judge(code, baseline)
    return out
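
# `evaluate_sample` is used by the loop below but was missing from the original
# snippet. This is a minimal sketch: the column names "instruction" and "output"
# are assumptions about the dataset schema, so adjust them to the actual fields
# of braindao/solbench-naive-judge-random-v1.
def evaluate_sample(sample):
    generated_code = generate(sample["instruction"])    # model writes Solidity code for the task
    return LLM_JUDGE(generated_code, sample["output"])  # judge it against the expert reference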

# Judge the model against every sample in the benchmark
from tqdm import tqdm

scores = {"openai": [], "anthropic": [], "hf": []}
for sample in tqdm(ds):
    score = evaluate_sample(sample)
    for key in score.keys():
        scores[key].append(score[key])

# Normalize: mean score per judge divided by the maximum score of 10, giving a value in [0, 1]
for key in scores.keys():
    scores[key] = sum(scores[key]) / (10 * len(scores[key]))

# Results payload in the format expected by the leaderboard
d = {
    "config": {
        "model_dtype": "torch.bfloat16",
        "model_name": MODEL_ID,
        "model_sha": "main"
    },
    "results": {
        "openai": {
            "score": 0
        },
        "anthropic": {
            "score": 0
        },
        "hf": {
            "score": 0
        }
    }
}

for key in scores.keys():
    d["results"][key]["score"] = scores[key]

# Serialize to JSON
json_object = json.dumps(d, indent=4)

# Write the results file, named after the model
file_name = MODEL_ID.split("/")[1] + ".json"
with open(file_name, "w") as outfile:
    outfile.write(json_object)
```

Finally, upload the results file to the `braindao/results` dataset. If you are not part of braindao, set `create_pr` to **True** so your results are submitted as a pull request:
```python
from huggingface_hub import upload_file

upload_file(
    path_or_fileobj=file_name,
    path_in_repo=f"{MODEL_ID}.json",
    repo_id="braindao/results",
    repo_type="dataset",
    create_pr=False,
)
```
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
"""
|