Commit 439afd4 · eduagarcia committed
Parent(s): 7a2430c

Fix model eval links and remove huggingface icon from Leaderboard name
Files changed:
- src/display/about.py (+8 -7)
- src/display/formatting.py (+18 -3)
- src/display/utils.py (+1 -1)
- src/leaderboard/read_evals.py (+6 -1)
- src/populate.py (+3 -3)
src/display/about.py CHANGED

@@ -4,21 +4,22 @@ from src.envs import REPO_ID, QUEUE_REPO, RESULTS_REPO, PATH_TO_COLLECTION, LEAD
 
 LM_EVAL_URL = "https://github.com/eduagarcia/lm-evaluation-harness-pt"
 
-TITLE = F"""<h1 align="center" id="space-title"
+TITLE = F"""<h1 align="center" id="space-title">π {LEADERBOARD_NAME}</h1>"""
 
 INTRODUCTION_TEXT = f"""
-π The
+π The π {LEADERBOARD_NAME} aims to track, rank and evaluate open LLMs and chatbots.
 
 This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">π€ Open LLM Leaderboard</a> with different benchmarks.
 
-
+Submit a model for automated evaluation on our GPU cluster on the "Submit" page!
 The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
 """
-
+task_count = 0
 TASKS_LIST= ""
 for task in Tasks:
     task = task.value
     TASKS_LIST += f'- <a href="{task.link}" target="_blank"> {task.col_name} </a> ({task.few_shot}-shot) - {task.description}\n'
+    task_count += 1
 
 TASKS_PARAMETERS = ""
 for task in Tasks:

@@ -33,7 +34,7 @@ With the plethora of large language models (LLMs) and chatbots being released we
 
 ## How it works
 
-π We evaluate models on
+π We evaluate models on {task_count} key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
 
 {TASKS_LIST}
 

@@ -133,9 +134,9 @@ I have an issue about accessing the leaderboard through the Gradio API
 
 
 EVALUATION_QUEUE_TEXT = f"""
-# Evaluation Queue for the
+# Evaluation Queue for the π {LEADERBOARD_NAME}
 
-Models added here will be automatically evaluated on
+Models added here will be automatically evaluated on our evaluation cluster.
 
 ## First steps before submitting a model
 
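The counting pattern introduced above can be checked in isolation; the sketch below uses a hypothetical two-member enum as a stand-in for the real Tasks from src/display/utils.py:

    from collections import namedtuple
    from enum import Enum

    # Hypothetical stand-in for the real Task/Tasks definitions
    Task = namedtuple("Task", ["col_name", "link", "few_shot", "description"])

    class Tasks(Enum):
        task_a = Task("Task A", "https://example.org/a", 3, "first benchmark")
        task_b = Task("Task B", "https://example.org/b", 5, "second benchmark")

    # Same pattern as the patched about.py: count members while building the list
    task_count = 0
    TASKS_LIST = ""
    for task in Tasks:
        task = task.value
        TASKS_LIST += f'- <a href="{task.link}" target="_blank"> {task.col_name} </a> ({task.few_shot}-shot) - {task.description}\n'
        task_count += 1

    assert task_count == len(Tasks)

Since Tasks is an Enum, len(Tasks) would yield the same number in one line; the manual counter simply keeps the count next to the list it describes.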
src/display/formatting.py CHANGED

@@ -4,6 +4,7 @@ from datetime import datetime, timezone
 from huggingface_hub import HfApi
 from huggingface_hub.hf_api import ModelInfo
 
+from src.envs import RESULTS_REPO, QUEUE_REPO
 
 API = HfApi()
 

@@ -11,11 +12,25 @@ def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
-def
+def make_requests_clickable_model(model_name, json_path=None):
     link = f"https://huggingface.co/{model_name}"
 
-    details_model_name = model_name.replace("/", "__")
-    details_link = f"https://huggingface.co/datasets/
+    #details_model_name = model_name.replace("/", "__")
+    details_link = f"https://huggingface.co/datasets/{QUEUE_REPO}/tree/main"
+    if '/' in model_name:
+        details_link += f"/{model_name.split('/')[0]}"
+    if json_path is not None:
+        details_link = f"https://huggingface.co/datasets/{QUEUE_REPO}/blob/main/{json_path}"
+
+    return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "π")
+
+def make_clickable_model(model_name, json_path=None):
+    link = f"https://huggingface.co/{model_name}"
+
+    #details_model_name = model_name.replace("/", "__")
+    details_link = f"https://huggingface.co/datasets/{RESULTS_REPO}/tree/main/{model_name}"
+    if json_path is not None:
+        details_link = f"https://huggingface.co/datasets/{RESULTS_REPO}/blob/main/{model_name}/{json_path}"
 
     return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "π")
 
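The two helpers differ only in which dataset repo the details icon points at; here is a sketch of the URL logic with illustrative repo names (the real QUEUE_REPO and RESULTS_REPO come from src.envs and may differ):

    QUEUE_REPO = "user/leaderboard-requests"   # illustrative, not the real value
    RESULTS_REPO = "user/leaderboard-results"  # illustrative, not the real value

    def requests_details_link(model_name, json_path=None):
        # Mirrors the URL logic of make_requests_clickable_model above
        link = f"https://huggingface.co/datasets/{QUEUE_REPO}/tree/main"
        if '/' in model_name:
            link += f"/{model_name.split('/')[0]}"  # org folder inside the requests repo
        if json_path is not None:
            link = f"https://huggingface.co/datasets/{QUEUE_REPO}/blob/main/{json_path}"
        return link

    def results_details_link(model_name, json_path=None):
        # Mirrors the URL logic of make_clickable_model above
        if json_path is not None:
            return f"https://huggingface.co/datasets/{RESULTS_REPO}/blob/main/{model_name}/{json_path}"
        return f"https://huggingface.co/datasets/{RESULTS_REPO}/tree/main/{model_name}"

    print(requests_details_link("org/model"))
    # https://huggingface.co/datasets/user/leaderboard-requests/tree/main/org
    print(results_details_link("org/model", "results_2024.json"))
    # https://huggingface.co/datasets/user/leaderboard-results/blob/main/org/model/results_2024.json

When no json_path is available, both helpers fall back to a tree/ listing, so the details icon still resolves to a valid page.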
src/display/utils.py CHANGED

@@ -31,7 +31,7 @@ class Tasks(Enum):
         limit=None,
         task_list=["oab_exams_generate"],
         link="https://huggingface.co/datasets/eduagarcia/oab_exams",
-        description="OAB Exams is a dataset of
+        description="OAB Exams is a dataset of 2,000 questions from the Brazilian Bar Association's exams."
     )
     brazilian_court_decisions_judgment = Task(
         benchmark="brazilian_court_decisions_judgment",
src/leaderboard/read_evals.py CHANGED

@@ -35,6 +35,7 @@ class EvalResult:
     flagged: bool = False
     status: str = "FINISHED"
     tags: list = None
+    json_filename: str = None
 
     @classmethod
     def init_from_json_file(self, json_filepath):

@@ -42,6 +43,8 @@ class EvalResult:
         with open(json_filepath) as fp:
             data = json.load(fp)
 
+        json_filename = os.path.basename(json_filepath)
+
         # We manage the legacy config format
         config = data.get("config_general")
 

@@ -100,6 +103,7 @@ class EvalResult:
             results=results,
             precision=precision,
             revision= config.get("model_sha", ""),
+            json_filename=json_filename
         )
 
     def update_with_request_file(self, requests_path):

@@ -137,7 +141,7 @@ class EvalResult:
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.json_filename),
             AutoEvalColumn.dummy.name: self.full_model,
             AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,

@@ -202,6 +206,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
+        print(model_result_filepath)
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
         if eval_result.full_model in dynamic_data:
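json_filename is just the basename of the result file, captured at load time so that to_dict can pass it to make_clickable_model; for a hypothetical result path:

    import os

    json_filepath = "eval-results/org/model/results_2024-01-01T00-00-00.json"  # hypothetical
    json_filename = os.path.basename(json_filepath)
    print(json_filename)  # results_2024-01-01T00-00-00.json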
src/populate.py CHANGED

@@ -3,7 +3,7 @@ import os
 
 import pandas as pd
 
-from src.display.formatting import has_no_nan_values,
+from src.display.formatting import has_no_nan_values, make_requests_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 from src.leaderboard.filter_models import filter_models
 from src.leaderboard.read_evals import get_raw_eval_results

@@ -35,7 +35,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)
 
-            data[EvalQueueColumn.model.name] =
+            data[EvalQueueColumn.model.name] = make_requests_clickable_model(data["model"], entry)
             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
 
             all_evals.append(data)

@@ -47,7 +47,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 with open(file_path) as fp:
                     data = json.load(fp)
 
-                data[EvalQueueColumn.model.name] =
+                data[EvalQueueColumn.model.name] = make_requests_clickable_model(data["model"], os.path.join(entry, sub_entry))
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                 all_evals.append(data)
 
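The json_path argument here is the request file's path relative to the requests repo root: the bare entry for top-level files, entry/sub_entry for files inside an org folder. A sketch of the resulting links, reusing the illustrative QUEUE_REPO from the formatting.py example:

    import os

    QUEUE_REPO = "user/leaderboard-requests"  # illustrative, not the real value

    def queue_blob_url(json_path):
        # URL make_requests_clickable_model builds when json_path is given
        return f"https://huggingface.co/datasets/{QUEUE_REPO}/blob/main/{json_path}"

    # Top-level request file: json_path is the directory entry itself
    print(queue_blob_url("some_model_eval_request.json"))

    # Request file inside an org folder: json_path joins entry and sub_entry
    print(queue_blob_url(os.path.join("org", "model_eval_request_float16.json")))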