Apply pre-commit

This commit applies the repository's pre-commit hooks: whitespace and blank-line normalization, import cleanup, line re-wrapping, and removal of some dead code.

Files changed:
- README.md +1 -1
- src/about.py +12 -4
- src/display/utils.py +13 -4
- src/envs.py +3 -3
- src/leaderboard/read_evals.py +24 -36
- src/submission/check_validity.py +20 -12
- src/submission/submit.py +3 -7
README.md
CHANGED

@@ -39,7 +39,7 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 
 # Code logic for more complex edits
 
-You'll find
+You'll find
 - the main table' columns names and properties in `src/display/utils.py`
 - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
 - teh logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`

(The only change in this hunk is whitespace on the "You'll find" line.)
src/about.py
CHANGED

@@ -41,8 +41,12 @@ class Tasks(Enum):
     jsem_exact_match = Task("scores", "jsem_exact_match", "JSEM")
     jsick_exact_match = Task("scores", "jsick_exact_match", "JSICK")
     jsquad_char_f1 = Task("scores", "jsquad_char_f1", "JSquad")
-    jsts_pearson = Task("scores", "jsts_pearson", "JSTS (Pearson) - 意味的類似度")  # Semantic Textual Similarity - 意味的類似度
-    jsts_spearman = Task("scores", "jsts_spearman", "JSTS (Spearman) - 意味的類似度")  # Semantic Textual Similarity - 意味的類似度
+    jsts_pearson = Task(
+        "scores", "jsts_pearson", "JSTS (Pearson) - 意味的類似度"
+    )  # Semantic Textual Similarity - 意味的類似度
+    jsts_spearman = Task(
+        "scores", "jsts_spearman", "JSTS (Spearman) - 意味的類似度"
+    )  # Semantic Textual Similarity - 意味的類似度
     kuci_exact_match = Task("scores", "kuci_exact_match", "KUCI")
     mawps_exact_match = Task("scores", "mawps_exact_match", "MAWPS")
     mmlu_en_exact_match = Task("scores", "mmlu_en_exact_match", "MMLU")

@@ -52,10 +56,14 @@ class Tasks(Enum):
     wiki_ner_set_f1 = Task("scores", "wiki_ner_set_f1", "Wiki NER")
     wiki_pas_set_f1 = Task("scores", "wiki_pas_set_f1", "Wiki PAS")
     wiki_reading_char_f1 = Task("scores", "wiki_reading_char_f1", "Wiki Reading")
-    wikicorpus_e_to_j_bert_score_ja_f1 = Task("scores", "wikicorpus-e-to-j_bert_score_ja_f1", "WikiCorpus E to J BERT Score")
+    wikicorpus_e_to_j_bert_score_ja_f1 = Task(
+        "scores", "wikicorpus-e-to-j_bert_score_ja_f1", "WikiCorpus E to J BERT Score"
+    )
     wikicorpus_e_to_j_bleu_ja = Task("scores", "wikicorpus-e-to-j_bleu_ja", "WikiCorpus E to J BLEU")
     wikicorpus_e_to_j_comet_wmt22 = Task("scores", "wikicorpus-e-to-j_comet_wmt22", "WikiCorpus E to J COMET WMT22")
-    wikicorpus_j_to_e_bert_score_en_f1 = Task("scores", "wikicorpus-j-to-e_bert_score_en_f1", "WikiCorpus J to E BERT Score")
+    wikicorpus_j_to_e_bert_score_en_f1 = Task(
+        "scores", "wikicorpus-j-to-e_bert_score_en_f1", "WikiCorpus J to E BERT Score"
+    )
     wikicorpus_j_to_e_bleu_en = Task("scores", "wikicorpus-j-to-e_bleu_en", "WikiCorpus J to E BLEU")
     wikicorpus_j_to_e_comet_wmt22 = Task("scores", "wikicorpus-j-to-e_comet_wmt22", "WikiCorpus J to E COMET WMT22")
     xlsum_ja_bert_score_ja_f1 = Task("scores", "xlsum_ja_bert_score_ja_f1", "XL-Sum JA BERT Score")
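For orientation, each `Tasks` member above binds a metric key in the results files to a display column name. Below is a minimal self-contained sketch of that pattern; the `Task` field names (`benchmark`, `metric`, `col_name`) are an assumption inferred from how `task.value.col_name` and `task_value.metric` are used elsewhere in this commit.

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # which block of the results file to read, here always "scores"
    metric: str     # key inside that block, e.g. "jsts_pearson"
    col_name: str   # column header shown in the leaderboard table


class Tasks(Enum):
    jsts_pearson = Task("scores", "jsts_pearson", "JSTS (Pearson) - 意味的類似度")
    jsts_spearman = Task("scores", "jsts_spearman", "JSTS (Spearman) - 意味的類似度")


for task in Tasks:
    print(task.name, "->", task.value.col_name)
```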
src/display/utils.py
CHANGED

@@ -5,6 +5,7 @@ import pandas as pd
 
 from src.about import Tasks
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 

@@ -21,12 +22,13 @@ class ColumnContent:
     never_hidden: bool = False
     dummy: bool = False
 
+
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
+# Scores
 # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])

@@ -47,6 +49,7 @@ auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column

@@ -57,12 +60,13 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = ""
+    symbol: str = ""  # emoji
 
 
 class ModelType(Enum):

@@ -87,11 +91,13 @@ class ModelType(Enum):
            return ModelType.IFT
        return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")

@@ -104,23 +110,26 @@ class Precision(Enum):
            return Precision.bfloat16
        return Precision.Unknown
 
+
 class AddSpecialTokens(Enum):
     true = ModelDetails("True")
     false = ModelDetails("False")
     Unknown = ModelDetails("?")
 
+
 class NumFewShots(Enum):
     shots_0 = ModelDetails("0")
     shots_4 = ModelDetails("4")
     Unknown = ModelDetails("?")
 
     def from_str(shots):
-        if shots=="0":
+        if shots == "0":
             return NumFewShots.shots_0
-        if shots=="4":
+        if shots == "4":
             return NumFewShots.shots_4
         return NumFewShots.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn)]
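The least obvious piece above is the `make_dataclass` call: `AutoEvalColumn` is generated at import time so every `Task` becomes a typed column definition, and `fields()` later walks the class `__dict__` to recover the `ColumnContent` defaults. Below is a self-contained sketch of that mechanism under stated assumptions; the column entries and the `displayed_by_default` field name are illustrative, not the real schema.

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False


# [name, annotation, default] triples, exactly like auto_eval_column_dict above
cols = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["jsts_pearson", ColumnContent, ColumnContent("JSTS (Pearson)", "number", True)],
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", cols, frozen=True)


def fields(raw_class):
    # skip dunder attributes, keep the ColumnContent defaults
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
print(COLS)  # ['Model', 'JSTS (Pearson)']
```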
src/envs.py
CHANGED

@@ -4,9 +4,9 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("HF_TOKEN")
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
 
-OWNER = "llm-jp"
+OWNER = "llm-jp"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/open-japanese-llm-leaderboard"

@@ -14,7 +14,7 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
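For reference, these constants are typically consumed at app start roughly as sketched below. `API` is real (it is imported in `src/submission/submit.py` later in this commit); `EVAL_RESULTS_PATH` and the `snapshot_download` calls are assumptions based on the standard leaderboard template.

```python
import os

from huggingface_hub import HfApi, snapshot_download

TOKEN = os.environ.get("HF_TOKEN")
OWNER = "llm-jp"
QUEUE_REPO = f"{OWNER}/requests"
RESULTS_REPO = f"{OWNER}/results"
CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")  # assumed counterpart to the queue path

API = HfApi(token=TOKEN)

# Typical startup step: mirror the request and result datasets locally.
snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", token=TOKEN)
```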
src/leaderboard/read_evals.py
CHANGED

@@ -1,37 +1,36 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass
-import dateutil
-import numpy as np
 from decimal import Decimal
 
+import dateutil
+
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks,
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, WeightType
 from src.submission.check_validity import is_model_on_hub
 
 
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str
-    full_model: str
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str
+    revision: str  # commit hash, "" if main
     results: dict
     # precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
     precision: str = "Unknown"
     # model_type: str = "Unknown"
-    weight_type: WeightType = WeightType.Original
-    architecture: str = "Unknown"
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False
     num_few_shots: str = "0"
     add_special_tokens: str = ""

@@ -47,7 +46,7 @@ class EvalResult:
         model_config = config.get("model", {})
 
         # Get model type from metainfo
-        # model_type_str = metainfo.get("model_type", "")
+        # model_type_str = metainfo.get("model_type", "")
         # model_type = ModelType.from_str(model_type_str)
         # model_type = metainfo.get("model_type", "Unknown")
 

@@ -59,13 +58,15 @@ class EvalResult:
         precision = model_config.get("dtype", "Unknown")
 
         # Add Special Tokens
-        add_special_tokens = str(config.get("pipeline_kwargs", {"add_special_tokens": "Unknown"}).get("add_special_tokens"))
+        add_special_tokens = str(
+            config.get("pipeline_kwargs", {"add_special_tokens": "Unknown"}).get("add_special_tokens")
+        )
 
         # Get model and org
         # org_and_model = config.get("model_name", config.get("offline_inference").get("model_name", None))
         org_and_model = config.get("model_name", config.get("offline_inference", {}).get("model_name", "Unknown"))
         org_and_model = org_and_model.split("/", 1)
-
+
         # org_and_modelがリストの場合、"/"で結合
         if isinstance(org_and_model, list):
             full_model = "/".join(org_and_model)

@@ -92,7 +93,7 @@ class EvalResult:
         architectures = getattr(model_config, "architectures", None)
         if architectures:
             architecture = ";".join(architectures)
-
+
         if "scores" not in data:
             raise KeyError(f"'scores' key not found in JSON file: {json_filepath}")
 

@@ -103,7 +104,6 @@ class EvalResult:
             score = scores.get(task_value.metric)
             results[task_value.metric] = score
 
-
         return self(
             eval_name=result_key,
             full_model=full_model,

@@ -121,12 +121,6 @@ class EvalResult:
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision)
-        if request_file:
-            with open(request_file, "r") as f:
-                request_data = json.load(f)
-        else:
-            print("No request file found.")
-
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)

@@ -186,17 +180,15 @@ def get_request_file_for_model(requests_path, model_name, precision):
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
+            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                 request_file = tmp_request_file
     return request_file
 
+
 def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
-
+
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):

@@ -210,7 +202,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
-
 
     eval_results = {}
     for model_result_filepath in model_result_filepaths:

@@ -225,17 +216,14 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         else:
             eval_results[eval_name] = eval_result
 
-            data_dict = eval_result.to_dict()
-
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict()
+            v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError: # not all eval values present
             continue
     # print(f"Processing file: {model_result_filepath}")
     # print(f"Eval result: {eval_result.to_dict()}")
 
-
-    return results
+    return results
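Putting the file together, the flow is: walk the results snapshot, build one `EvalResult` per run, enrich it from the matching request file, and keep only entries whose `to_dict()` yields every expected column. A usage sketch of how `src/populate.py` typically drives these helpers follows; the local paths are hypothetical.

```python
import pandas as pd

from src.leaderboard.read_evals import get_raw_eval_results

# hypothetical local snapshots of the results and requests datasets
raw_results = get_raw_eval_results("eval-results", "eval-queue")

# one row per model, keyed by the AutoEvalColumn names
df = pd.DataFrame.from_records([r.to_dict() for r in raw_results])
print(df.head())
```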
src/submission/check_validity.py
CHANGED

@@ -1,8 +1,6 @@
 import json
 import os
-import re
 from collections import defaultdict
-from datetime import datetime, timedelta, timezone
 
 import huggingface_hub
 from huggingface_hub import ModelCard

@@ -10,6 +8,7 @@ from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
+
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""
     try:

@@ -31,31 +30,38 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
     return True, ""
 
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
+
+def is_model_on_hub(
+    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
+) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        config = AutoConfig.from_pretrained(
+            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+        )
         if test_tokenizer:
             try:
-                AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+                AutoTokenizer.from_pretrained(
+                    model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+                )
             except ValueError as e:
+                return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
+            except Exception:
                 return (
                     False,
-                    f"uses a tokenizer which is not in a transformers release: {e}",
-                    None
+                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
+                    None,
                 )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
         return True, None, config
 
     except ValueError:
         return (
             False,
             "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
+            None,
         )
 
-    except Exception
+    except Exception:
         return False, "was not found on hub!", None
 
 

@@ -70,10 +76,12 @@ def get_model_size(model_info: ModelInfo, precision: str):
     model_size = size_factor * model_size
     return model_size
 
+
 def get_model_arch(model_info: ModelInfo):
     """Gets the model architecture from the configuration"""
     return model_info.config.get("architectures", "Unknown")
 
+
 def already_submitted_models(requested_models_dir: str) -> set[str]:
     """Gather a list of already submitted models to avoid duplicates"""
     depth = 1

@@ -88,7 +96,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
                 continue
             with open(os.path.join(root, file), "r") as f:
                 info = json.load(f)
-                if info["status"]=="FAILED":
+                if info["status"] == "FAILED":
                     continue
                 file_names.append(f"{info['model']}_{info['precision']}_{info['add_special_tokens']}")
 
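A usage sketch of the reformatted `is_model_on_hub` follows; the model id is hypothetical, and the returned error strings are phrased to be appended after the model name, which is how the submission flow typically formats them.

```python
from src.submission.check_validity import is_model_on_hub

ok, error, config = is_model_on_hub(
    model_name="llm-jp/llm-jp-13b-v2.0",  # hypothetical submission
    revision="main",
    token=None,
    trust_remote_code=False,
    test_tokenizer=True,
)
if not ok:
    print(f"Model llm-jp/llm-jp-13b-v2.0 {error}")
else:
    print("Architectures:", getattr(config, "architectures", None))
```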
src/submission/submit.py
CHANGED

@@ -3,17 +3,13 @@ import os
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH,
-from src.submission.check_validity import (
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
+from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
+from src.submission.check_validity import already_submitted_models, check_model_card, get_model_size, is_model_on_hub
 
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
+
 def add_new_eval(
     model: str,
     revision: str,