import json
import os
from dataclasses import dataclass
from typing import Dict, List, Tuple
from distutils.util import strtobool

import dateutil.parser
import numpy as np

from src.display_models.utils import AutoEvalColumn, make_clickable_model

# Hyunwoo - ko_commongen_v2: need to confirm whether the metric should be acc_norm
# METRICS[i] is the metric read for BENCHMARKS[i].
METRICS = ["acc_norm", "acc_norm", "acc", "mc2", "acc_norm"]
BENCHMARKS = ["ko_arc_challenge", "ko_hellaswag", "ko_mmlu", "ko_truthfulqa_mc", "ko_commongen_v2"] #, "ethicalverification"]
BENCH_TO_NAME = {
    "ko_arc_challenge": AutoEvalColumn.arc.name,
    "ko_hellaswag": AutoEvalColumn.hellaswag.name,
    "ko_mmlu": AutoEvalColumn.mmlu.name,
    "ko_truthfulqa_mc": AutoEvalColumn.truthfulqa.name,
    "ko_commongen_v2": AutoEvalColumn.commongen_v2.name,
    # TODO: Uncomment when we have results for these
    # "ethicalverification": AutoEvalColumn.ethicalverification.name,
}
IS_PUBLIC = bool(strtobool(os.environ.get("IS_PUBLIC", "True")))

@dataclass
class EvalResult:
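    """Aggregated evaluation results for one (org, model, precision) entry."""
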
    eval_name: str
    org: str
    model: str
    revision: str
    results: dict
    precision: str = ""
    model_type: str = ""
    weight_type: str = ""

    def to_dict(self):
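        """Convert this result into a leaderboard row dict keyed by AutoEvalColumn names."""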
        from src.load_from_hub import is_model_on_hub

        if self.org is not None:
            base_model = f"{self.org}/{self.model}"
        else:
            base_model = f"{self.model}"
        data_dict = {}

        data_dict["eval_name"] = self.eval_name  # not a column, just a save name
        data_dict["weight_type"] = self.weight_type  # not a column, just a save name
        data_dict[AutoEvalColumn.precision.name] = self.precision
        data_dict[AutoEvalColumn.model_type.name] = self.model_type
        data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
        data_dict[AutoEvalColumn.dummy.name] = base_model
        data_dict[AutoEvalColumn.revision.name] = self.revision
        data_dict[AutoEvalColumn.average.name] = sum(self.results.values()) / len(BENCHMARKS)
        data_dict[AutoEvalColumn.still_on_hub.name] = (
            is_model_on_hub(base_model, self.revision)[0] or base_model == "baseline"
        )

        for benchmark in BENCHMARKS:
            if benchmark not in self.results.keys():
                self.results[benchmark] = None

        for k, v in BENCH_TO_NAME.items():
            data_dict[v] = self.results[k]

        return data_dict


def parse_eval_result(json_filepath: str) -> Tuple[str, List[EvalResult]]:
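    """Parse a single evaluation JSON file into (result_key, per-benchmark EvalResults)."""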
    with open(json_filepath) as fp:
        data = json.load(fp)

    try:
        config = data["config"]
    except KeyError:
        config = data["config_general"]
    model = config.get("model_name", None)
    if model is None:
        model = config.get("model_args", None)

    model_sha = config.get("model_sha", "")
    model_split = model.split("/", 1)

    precision = config.get("model_dtype")

    model = model_split[-1]

    if len(model_split) == 1:
        org = None
        model = model_split[0]
        result_key = f"{model}_{precision}"
    else:
        org = model_split[0]
        model = model_split[1]
        result_key = f"{org}_{model}_{precision}"

    eval_results = []
    for benchmark, metric in zip(BENCHMARKS, METRICS):
        accs = np.array([v.get(metric, None) for k, v in data["results"].items() if benchmark in k])
        if accs.size == 0 or any([acc is None for acc in accs]):
            continue
        mean_acc = np.mean(accs) * 100.0
        eval_results.append(
            EvalResult(
                eval_name=result_key,
                org=org,
                model=model,
                revision=model_sha,
                results={benchmark: mean_acc},
                precision=precision,  # todo model_type=, weight_type=
            )
        )

    return result_key, eval_results


def get_eval_results() -> List[EvalResult]:
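    """Walk the results directory and merge per-benchmark files into one EvalResult per model."""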
    json_filepaths = []

    for root, _dirs, files in os.walk("eval-results" + ("-private" if not IS_PUBLIC else "")):
        # We should only have json files in model results
        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
            continue

        # Sort the files by date
        # store results by precision maybe?
        try:
            files.sort(key=lambda x: dateutil.parser.parse(x.split("_", 1)[-1][:-5]))
        except dateutil.parser.ParserError:
            files = [files[-1]]

        # up_to_date = files[-1]
        for file in files:
            json_filepaths.append(os.path.join(root, file))

    eval_results = {}
    for json_filepath in json_filepaths:
        result_key, results = parse_eval_result(json_filepath)
        for eval_result in results:
            if result_key in eval_results.keys():
                eval_results[result_key].results.update(eval_result.results)
            else:
                eval_results[result_key] = eval_result

    eval_results = list(eval_results.values())

    return eval_results


def get_eval_results_dicts() -> List[Dict]:
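    """Return all evaluation results as leaderboard-ready row dicts."""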
    eval_results = get_eval_results()

    return [e.to_dict() for e in eval_results]
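

# Hypothetical local usage sketch (not part of the leaderboard app): it assumes the
# "eval-results" directory is present and that pandas is installed in the environment.
if __name__ == "__main__":
    import pandas as pd

    # Build a dataframe of leaderboard rows for a quick sanity check.
    rows = get_eval_results_dicts()
    df = pd.DataFrame.from_records(rows)
    print(df.head())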