Sean Cho
add private repo
bba982c
raw
history blame
No virus
5.03 kB
import json
import os
from dataclasses import dataclass
from typing import Dict, List, Tuple
from distutils.util import strtobool
import dateutil
import numpy as np
from src.display_models.utils import AutoEvalColumn, make_clickable_model
# Metric key read from each benchmark's result entries. The list is paired
# positionally with BENCHMARKS (first metric goes with the first benchmark, ...).
# NOTE(review): METRICS has four entries while BENCHMARKS has five, so a
# positional zip() over the two stops before "ko_commongen_v2" -- confirm
# which metric that benchmark should use and whether this is intentional.
METRICS = [
    "acc_norm",
    "acc_norm",
    "acc",
    "mc2",
]

# Benchmark identifiers looked up (by substring) in the result file task names.
BENCHMARKS = [
    "ko_arc_challenge",
    "ko_hellaswag",
    "ko_mmlu",
    "ko_truthfulqa_mc",
    "ko_commongen_v2",
    # "ethicalverification" is not enabled yet.
]

# Maps each benchmark identifier to the leaderboard column that shows its score.
BENCH_TO_NAME = {
    "ko_arc_challenge": AutoEvalColumn.arc.name,
    "ko_hellaswag": AutoEvalColumn.hellaswag.name,
    "ko_mmlu": AutoEvalColumn.mmlu.name,
    "ko_truthfulqa_mc": AutoEvalColumn.truthfulqa.name,
    "ko_commongen_v2": AutoEvalColumn.commongen_v2.name,
    # TODO: Uncomment when we have results for these
    # "ethicalverification": AutoEvalColumn.ethicalverification.name,
}

# Read from the IS_PUBLIC environment variable; treated as "True" when unset.
IS_PUBLIC = bool(strtobool(os.getenv("IS_PUBLIC", "True")))
@dataclass
class EvalResult:
    """Scores and identifying metadata for one evaluated model.

    `results` maps a benchmark identifier to its score. `org` is annotated as
    `str` but is handled as possibly `None` (a model name with no organisation
    prefix); `to_dict` then uses the bare model name.
    """

    eval_name: str
    org: str
    model: str
    revision: str
    results: dict
    precision: str = ""
    model_type: str = ""
    weight_type: str = ""

    def to_dict(self):
        """Return this result as a flat dict keyed by leaderboard column name.

        Besides the `AutoEvalColumn` columns, the dict carries two bookkeeping
        keys, "eval_name" and "weight_type". Every benchmark listed in
        `BENCH_TO_NAME` gets an entry; a benchmark with no score maps to None.

        The method does not modify `self.results`, so it can be called more
        than once on the same instance.
        """
        # Imported inside the method, as in the original code
        # (presumably to avoid an import cycle -- TODO confirm).
        from src.load_from_hub import is_model_on_hub

        if self.org is None:
            base_model = f"{self.model}"
        else:
            base_model = f"{self.org}/{self.model}"

        # Skip missing (None) scores so the sum never mixes numbers with None.
        available_scores = [score for score in self.results.values() if score is not None]

        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name
            "weight_type": self.weight_type,  # not a column, just a save name
            AutoEvalColumn.precision.name: self.precision,
            AutoEvalColumn.model_type.name: self.model_type,
            AutoEvalColumn.model.name: make_clickable_model(base_model),
            AutoEvalColumn.dummy.name: base_model,
            AutoEvalColumn.revision.name: self.revision,
            # The divisor is a fixed 4.0 regardless of how many scores are present.
            AutoEvalColumn.average.name: sum(available_scores) / 4.0,
            AutoEvalColumn.still_on_hub.name: (
                is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
            ),
        }

        # Look scores up with .get() instead of writing None placeholders back
        # into self.results: mutating the instance made a second to_dict() call
        # fail when summing the scores.
        for benchmark, column_name in BENCH_TO_NAME.items():
            data_dict[column_name] = self.results.get(benchmark)

        return data_dict
def parse_eval_result(json_filepath: str) -> Tuple[str, List["EvalResult"]]:
    """Read one result JSON file and build one `EvalResult` per scored benchmark.

    Args:
        json_filepath: Path of the JSON file to read. It must contain a
            "results" mapping and a "config" (or "config_general") mapping.

    Returns:
        A `(result_key, eval_results)` pair. `result_key` combines the
        organisation (when present), model name, model sha and precision.
        `eval_results` holds one `EvalResult` per benchmark for which every
        matching task entry provides the paired metric; each carries a
        single-entry `results` dict with the mean score scaled by 100.

    Raises:
        ValueError: If the config has neither "model_name" nor "model_args".
        KeyError: If the file has no "config"/"config_general" or no "results".
    """
    with open(json_filepath, encoding="utf-8") as fp:
        data = json.load(fp)

    # The configuration section is stored under one of two keys.
    try:
        config = data["config"]
    except KeyError:
        config = data["config_general"]

    model = config.get("model_name")
    if model is None:
        model = config.get("model_args")
    if model is None:
        # Fail with a message naming the file instead of an AttributeError on None.
        raise ValueError(f"No 'model_name' or 'model_args' in the config of {json_filepath}")

    model_sha = config.get("model_sha", "")
    precision = config.get("model_dtype")

    # "org/name" -> ("org", "name"); a name without "/" has no organisation.
    org, separator, name = model.partition("/")
    if separator:
        model = name
        result_key = f"{org}_{model}_{model_sha}_{precision}"
    else:
        org = None
        result_key = f"{model}_{model_sha}_{precision}"

    eval_results = []
    # zip() pairs benchmarks and metrics by position and stops at the shorter
    # list, so a benchmark without a paired metric is not scored here.
    for benchmark, metric in zip(BENCHMARKS, METRICS):
        # A benchmark may be split over several tasks; match them by substring.
        scores = [
            task_scores.get(metric)
            for task_name, task_scores in data["results"].items()
            if benchmark in task_name
        ]
        if not scores or any(score is None for score in scores):
            continue

        mean_acc = np.mean(scores) * 100.0
        eval_results.append(
            EvalResult(
                eval_name=result_key,
                org=org,
                model=model,
                revision=model_sha,
                results={benchmark: mean_acc},
                precision=precision,  # todo model_type=, weight_type=
            )
        )

    return result_key, eval_results
def get_eval_results() -> List[EvalResult]:
    """Collect every result file on disk and merge them into one entry per model.

    Walks "eval-results" (or "eval-results-private" when `IS_PUBLIC` is false),
    parses each JSON file with `parse_eval_result`, and merges results sharing
    the same result key by updating the first entry's `results` dict. Files in
    one directory are processed in date order, so a later file's score for a
    benchmark replaces an earlier one.

    Returns:
        One `EvalResult` per distinct result key.
    """
    # Import the submodule explicitly: a bare `import dateutil` does not make
    # `dateutil.parser` available on every python-dateutil release.
    from dateutil import parser as date_parser

    results_dir = "eval-results" + ("" if IS_PUBLIC else "-private")

    json_filepaths = []
    for root, _dirnames, files in os.walk(results_dir):
        # We should only have json files in model results
        if not files or not all(name.endswith(".json") for name in files):
            continue

        # Sort the files by the date embedded after the first "_" in the file
        # name (the trailing ".json" is cut off before parsing).
        try:
            files.sort(key=lambda name: date_parser.parse(name.split("_", 1)[-1][:-5]))
        except date_parser.ParserError:
            # Unparseable date: fall back to a single file from this directory.
            files = [files[-1]]

        json_filepaths.extend(os.path.join(root, name) for name in files)

    merged = {}
    for json_filepath in json_filepaths:
        result_key, results = parse_eval_result(json_filepath)
        for eval_result in results:
            existing = merged.get(result_key)
            if existing is None:
                merged[result_key] = eval_result
            else:
                existing.results.update(eval_result.results)

    return list(merged.values())
def get_eval_results_dicts() -> List[Dict]:
    """Return every collected evaluation result converted with `to_dict()`."""
    return [result.to_dict() for result in get_eval_results()]