File size: 15,409 Bytes
5a5a36e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95b7a71
 
5a5a36e
95b7a71
b9cb207
5a5a36e
95b7a71
 
5a5a36e
b9cb207
5a5a36e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b9cb207
 
5a5a36e
 
 
3f45e26
5a5a36e
 
 
 
 
 
3f45e26
5a5a36e
b9cb207
5a5a36e
 
 
 
 
 
b9cb207
5a5a36e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
08847a4
 
 
5a5a36e
 
 
 
 
 
 
08847a4
5a5a36e
 
 
 
 
08847a4
5a5a36e
08847a4
5a5a36e
 
 
 
 
 
 
08847a4
 
 
5a5a36e
 
 
 
 
 
 
08847a4
5a5a36e
 
 
 
 
08847a4
5a5a36e
08847a4
5a5a36e
 
 
 
 
 
 
 
 
 
 
b10d6d4
5a5a36e
08847a4
 
c3c6a41
5a5a36e
359bb98
 
5a5a36e
 
 
 
 
 
 
b10d6d4
 
5a5a36e
 
 
 
 
 
 
 
359bb98
 
5a5a36e
 
 
b8e5d23
5a5a36e
b8e5d23
b10d6d4
b8e5d23
5a5a36e
625f2d5
5a5a36e
 
625f2d5
653f44e
625f2d5
5a5a36e
 
 
 
b10d6d4
 
 
6dd427b
5a5a36e
 
625f2d5
 
5a5a36e
 
 
 
6dd427b
 
653f44e
625f2d5
653f44e
 
 
625f2d5
5a5a36e
 
 
b8e5d23
3f45e26
5a5a36e
 
 
 
b8e5d23
5a5a36e
 
 
 
 
 
 
 
 
 
 
b8e5d23
 
5a5a36e
b9cb207
b8e5d23
b9cb207
 
 
 
 
228e920
b9cb207
 
 
 
 
 
 
 
 
 
 
 
 
228e920
 
b9cb207
 
 
5a5a36e
 
 
 
b10d6d4
 
5a5a36e
653f44e
 
 
5a5a36e
 
 
 
 
 
 
b10d6d4
 
 
 
5a5a36e
 
653f44e
 
 
 
 
 
5a5a36e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b6c696
 
 
5a5a36e
c3c6a41
 
 
 
 
 
 
1b6c696
 
 
c3c6a41
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
from dataclasses import dataclass, make_dataclass
from enum import Enum

import pandas as pd

def fields(raw_class):
    """Return the values of every class attribute whose name is not dunder-like.

    An attribute is skipped when its name starts with ``__`` or ends with
    ``__``; the remaining values are returned in class-definition order.
    """
    selected = []
    for attr_name, attr_value in raw_class.__dict__.items():
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        selected.append(attr_value)
    return selected


@dataclass
class Task:
    """One evaluation task: where its score lives and how it is labelled."""

    # Benchmark identifier, e.g. "arc:challenge".
    benchmark: str
    # Metric key read for this benchmark, e.g. "acc,none".
    metric: str
    # Column header shown for this task, e.g. "ARC-c".
    col_name: str

class Tasks(Enum):
    """Benchmarks shown on the leaderboard; every entry reads the "acc,none" metric.

    Each member name is also used as the attribute name of the matching score
    column on ``AutoEvalColumn``.
    """

    arc = Task(benchmark="arc:challenge", metric="acc,none", col_name="ARC-c")
    arc_easy = Task(benchmark="arc:easy", metric="acc,none", col_name="ARC-e")
    boolq = Task(benchmark="boolq", metric="acc,none", col_name="Boolq")
    hellaswag = Task(benchmark="hellaswag", metric="acc,none", col_name="HellaSwag")
    lambada_openai = Task(benchmark="lambada:openai", metric="acc,none", col_name="Lambada")
    mmlu = Task(benchmark="mmlu", metric="acc,none", col_name="MMLU")
    openbookqa = Task(benchmark="openbookqa", metric="acc,none", col_name="Openbookqa")
    piqa = Task(benchmark="piqa", metric="acc,none", col_name="Piqa")
    # Open question kept from the original: should this be truthfulqa:mc1 or truthfulqa:mc2?
    truthfulqa_mc = Task(benchmark="truthfulqa:mc1", metric="acc,none", col_name="Truthfulqa")
    # Disabled alternatives kept for reference:
    #   arc_challenge = Task("arc:challenge", "acc_norm,none", "Arc challenge")
    #   truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
    winogrande = Task(benchmark="winogrande", metric="acc,none", col_name="Winogrande")
    #   gsm8k = Task("gsm8k", "acc", "GSM8K")

# These classes hold the user-facing column names in one place,
# so that a rename does not have to be repeated
# throughout the code base.
@dataclass(frozen=True)
class ColumnContent:
    """Describe one user-facing table column.

    The class is frozen so that instances are hashable. Instances are used as
    field defaults when ``AutoEvalColumn`` is built with ``make_dataclass``,
    and since Python 3.11 ``dataclasses`` rejects any unhashable default with
    ``ValueError: mutable default ... use default_factory``.
    """

    # Column header text.
    name: str
    # Datatype label; this file uses "str", "number", "bool" and "markdown".
    type: str
    # Whether the column is shown without the user selecting it.
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    # True only for the helper column used by the search bar.
    dummy: bool = False

# Ordered [attribute_name, annotation, default] triples that make_dataclass
# turns into the fields of AutoEvalColumn.
auto_eval_column_dict = [
    # Init
    ["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)],
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    # Scores
    ["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)],
]
# One score column per benchmark, named after the Tasks member.
for task in Tasks:
    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
auto_eval_column_dict.extend(
    [
        ["params", ColumnContent, ColumnContent("#Params (B)", "number", True)],
        ["model_size", ColumnContent, ColumnContent("#Size (G)", "number", True)],
        # Dummy column for the search bar (hidden by the custom CSS)
        ["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)],
        # Model information
        ["model_type", ColumnContent, ColumnContent("Type", "str", False, hidden=True)],
        ["architecture", ColumnContent, ColumnContent("Architecture", "str", False)],
        ["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, hidden=True)],
        ["quant_type", ColumnContent, ColumnContent("Quant type", "str", False)],
        ["precision", ColumnContent, ColumnContent("Precision", "str", False)],
        ["weight_dtype", ColumnContent, ColumnContent("Weight dtype", "str", False)],
        ["compute_dtype", ColumnContent, ColumnContent("Compute dtype", "str", False)],
        ["merged", ColumnContent, ColumnContent("Merged", "bool", False, hidden=True)],
        ["license", ColumnContent, ColumnContent("Hub License", "str", False)],
        ["likes", ColumnContent, ColumnContent("Hub ❀️", "number", False)],
        ["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False, hidden=True)],
        ["revision", ColumnContent, ColumnContent("Model sha", "str", False, hidden=False)],
        ["flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)],
        ["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)],
        ["double_quant", ColumnContent, ColumnContent("Double Quant", "bool", False)],
        # NOTE(review): typed "bool" although group sizes look numeric - confirm.
        ["group_size", ColumnContent, ColumnContent("Group Size", "bool", False)],
    ]
)

# Alphabetical variant: the first three entries stay in place and the rest are
# ordered by attribute name. It is computed here but AutoEvalColumn below is
# still built from the unsorted list.
sorted_columns = sorted(auto_eval_column_dict[3:], key=lambda entry: entry[0])
sorted_auto_eval_column_dict = auto_eval_column_dict[:3] + sorted_columns

# make_dataclass lets the score fields be filled dynamically from Tasks.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """Columns of the evaluation-queue table.

    The attributes carry no annotations, so they are plain class attributes
    rather than dataclass fields.
    """

    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    precision = ColumnContent("precision", "str", True)
    # displayed_by_default is a bool. The string "Original" used to be passed
    # here; True keeps the same truthiness with the declared type.
    weight_type = ColumnContent("weight_type", "str", True)
    status = ColumnContent("status", "str", True)


# Reference row labelled "Baseline", keyed by the display names of the
# AutoEvalColumn columns. Columns not listed here are simply absent from the row.
baseline_row = {
    AutoEvalColumn.model.name: "<p>Baseline</p>",
    AutoEvalColumn.revision.name: "N/A",
    AutoEvalColumn.precision.name: None,
    AutoEvalColumn.merged.name: False,
    AutoEvalColumn.average.name: 31.0,
    AutoEvalColumn.arc.name: 25.0,
    # Disabled entries kept from an earlier column set:
    # AutoEvalColumn.hellaswag.name: 25.0,
    # AutoEvalColumn.truthfulqa.name: 25.0,
    AutoEvalColumn.winogrande.name: 50.0,
    # AutoEvalColumn.gsm8k.name: 0.21,
    AutoEvalColumn.dummy.name: "baseline",
    AutoEvalColumn.model_type.name: "",
    AutoEvalColumn.flagged.name: False,
    # Columns added for the low-bit leaderboard
    AutoEvalColumn.mmlu.name: 25.0,
    AutoEvalColumn.lambada_openai.name: 25.0,
    AutoEvalColumn.hellaswag.name: 25.0,
    AutoEvalColumn.piqa.name: 25.0,
    AutoEvalColumn.truthfulqa_mc.name: 25.0,
    AutoEvalColumn.openbookqa.name: 25.0,
    # NOTE(review): this is the boolean True while every other task score is a
    # float - presumably a numeric baseline was intended; confirm the value.
    AutoEvalColumn.boolq.name: True,
    AutoEvalColumn.arc_easy.name: 25.0,
    AutoEvalColumn.double_quant.name: False,
}

# Average ⬆️ human baseline is 0.897 (source: averaging human baselines below)
# ARC human baseline is 0.80 (source: https://lab42.global/arc/)
# HellaSwag human baseline is 0.95 (source: https://deepgram.com/learn/hellaswag-llm-benchmark-guide)
# MMLU human baseline is 0.898 (source: https://openreview.net/forum?id=d7KBjmI3GmQ)
# TruthfulQA human baseline is 0.94 (source: https://arxiv.org/pdf/2109.07958.pdf)
# Winogrande: https://leaderboard.allenai.org/winogrande/submissions/public
# GSM8K: paper
# Define the human baselines
# Reference row labelled "Human performance", keyed by the display names of the
# AutoEvalColumn columns. Only ARC and Winogrande scores are currently set;
# the other task entries are commented out.
human_baseline_row = {
    AutoEvalColumn.model.name: "<p>Human performance</p>",
    AutoEvalColumn.revision.name: "N/A",
    AutoEvalColumn.precision.name: None,
    # NOTE(review): 92.75 is not the mean of the two active scores below
    # (80.0 and 94.0) - confirm how this average should be derived.
    AutoEvalColumn.average.name: 92.75,
    AutoEvalColumn.merged.name: False,
    AutoEvalColumn.arc.name: 80.0,
    # AutoEvalColumn.hellaswag.name: 95.0,
    # AutoEvalColumn.mmlu.name: 89.8,
    # AutoEvalColumn.truthfulqa.name: 94.0,
    AutoEvalColumn.winogrande.name: 94.0,
    # AutoEvalColumn.gsm8k.name: 100,
    AutoEvalColumn.dummy.name: "human_baseline",
    AutoEvalColumn.model_type.name: "",
    AutoEvalColumn.flagged.name: False,
}

@dataclass
class ModelDetails:
    """Display label, plus an optional symbol, used as the value of the enums below."""

    # Human-readable label.
    name: str
    # Emoji marker; only the model-type and quant-type enums set it.
    symbol: str = ""

"""
class ModelType(Enum):
    PT = ModelDetails(name="GPTQ", symbol="🟒")
    CPT = ModelDetails(name="AWQ", symbol="🟩")
    FT = ModelDetails(name="llama.cpp", symbol="πŸ”·")
    chat = ModelDetails(name="Bisandbytes", symbol="πŸ”΅")
    merges = ModelDetails(name="AutoRound", symbol="πŸ’")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type):
        if "fine-tuned" in type or "πŸ”·" in type:
            return ModelType.FT
        if "continously pretrained" in type or "🟩" in type:
            return ModelType.CPT
        if "pretrained" in type or "🟒" in type:
            return ModelType.PT
        if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "β­•", "πŸ”΅"]]):
            return ModelType.chat
        if "merge" in type or "πŸ’" in type:
            return ModelType.merges
        return ModelType.Unknown
"""

class ModelType(Enum):
    """Training category of a model, with a display name and a symbol.

    NOTE(review): the symbol literals look mis-encoded in this copy of the
    file (UTF-8 emoji decoded with a single-byte code page) - confirm the file
    encoding before editing them. They are kept exactly as found.
    """

    PT = ModelDetails(name="pretrained", symbol="🟒")
    CPT = ModelDetails(name="continuously pretrained", symbol="🟩")
    FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="πŸ”·")
    chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="πŸ”΅")
    merges = ModelDetails(name="base merges and moerges", symbol="πŸ’")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        """Return ``<symbol><separator><name>`` for this member."""
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type):
        """Map a free-form description to a member by substring matching.

        The checks run in a fixed order and the first hit wins; a string that
        matches nothing yields ``ModelType.Unknown``.
        """
        if "fine-tuned" in type or "πŸ”·" in type:
            return ModelType.FT
        # Must run before the plain "pretrained" test, which would otherwise
        # claim these strings. Both the correct spelling (as used in CPT's own
        # name) and the historical misspelling are accepted.
        if "continuously pretrained" in type or "continously pretrained" in type or "🟩" in type:
            return ModelType.CPT
        if "pretrained" in type or "🟒" in type or "quantization" in type:
            return ModelType.PT
        chat_markers = ("instruction-tuned", "RL-tuned", "chat", "🟦", "β­•", "πŸ”΅")
        if any(marker in type for marker in chat_markers):
            return ModelType.chat
        if "merge" in type or "πŸ’" in type:
            return ModelType.merges
        return ModelType.Unknown

class WeightType(Enum):
    """Weight-type labels: Adapter, Original and Delta."""

    Adapter = ModelDetails(name="Adapter")
    Original = ModelDetails(name="Original")
    Delta = ModelDetails(name="Delta")
    

class QuantType(Enum):
    """Quantization method of a model, with a display name and a symbol.

    NOTE(review): the symbol literals look mis-encoded in this copy of the
    file - confirm the file encoding before editing them. They are kept
    exactly as found.
    """

    gptq = ModelDetails(name="GPTQ", symbol="🟒")
    aqlm = ModelDetails(name="AQLM", symbol="⭐")
    awq = ModelDetails(name="AWQ", symbol="🟩")
    llama_cpp = ModelDetails(name="llama.cpp", symbol="πŸ”·")
    bnb = ModelDetails(name="bitsandbytes", symbol="πŸ”΅")
    autoround = ModelDetails(name="AutoRound", symbol="πŸ’")
    Unknown = ModelDetails(name="?", symbol="?")
    QuantType_None = ModelDetails(name="None", symbol="βœ–")

    def to_str(self, separator=" "):
        """Return ``<symbol><separator><name>`` for this member."""
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(quant_dtype):
        """Return the member whose display name equals ``quant_dtype`` exactly.

        Matching is case-sensitive; any other value yields
        ``QuantType.Unknown``. Declared static so it behaves the same whether
        it is called on the class or on a member.
        """
        for member in QuantType:
            if member.value.name == quant_dtype:
                return member
        return QuantType.Unknown



class WeightDtype(Enum):
    """Weight data-type labels, plus the filter value "All" and an unknown marker."""

    all = ModelDetails("All")
    int2 = ModelDetails("int2")
    int3 = ModelDetails("int3")
    int4 = ModelDetails("int4")
    int8 = ModelDetails("int8")
    nf4 = ModelDetails("nf4")
    fp4 = ModelDetails("fp4")
    f16 = ModelDetails("float16")
    bf16 = ModelDetails("bfloat16")
    f32 = ModelDetails("float32")

    Unknown = ModelDetails("?")

    @staticmethod
    def from_str(weight_dtype):
        """Return the member whose display name equals ``weight_dtype`` exactly.

        Matching is case-sensitive; any other value yields
        ``WeightDtype.Unknown``. Declared static so it behaves the same whether
        it is called on the class or on a member.
        """
        for member in WeightDtype:
            if member.value.name == weight_dtype:
                return member
        return WeightDtype.Unknown

class ComputeDtype(Enum):
    """Compute data-type labels, plus the filter value "All" and an unknown marker."""

    all = ModelDetails("All")
    fp16 = ModelDetails("float16")
    bf16 = ModelDetails("bfloat16")
    int8 = ModelDetails("int8")
    fp32 = ModelDetails("float32")

    Unknown = ModelDetails("?")

    @staticmethod
    def from_str(compute_dtype):
        """Return the member whose display name equals ``compute_dtype`` exactly.

        Matching is case-sensitive; any other value yields
        ``ComputeDtype.Unknown``. Declared static so it behaves the same
        whether it is called on the class or on a member.
        """
        for member in ComputeDtype:
            if member.value.name == compute_dtype:
                return member
        return ComputeDtype.Unknown
    
    
class GroupDtype(Enum):
    """Group-size labels (as strings), plus the catch-all value "All"."""

    group_1 = ModelDetails("-1")
    group_1024 = ModelDetails("1024")
    group_256 = ModelDetails("256")
    group_128 = ModelDetails("128")
    group_64 = ModelDetails("64")
    group_32 = ModelDetails("32")

    group_all = ModelDetails("All")

    @staticmethod
    def from_str(compute_dtype):
        """Return the member whose display name equals the given string exactly.

        The parameter keeps its historical name ``compute_dtype`` for
        keyword-argument compatibility, although it is compared against the
        group-size labels. Any unmatched value yields ``GroupDtype.group_all``.
        Declared static so it behaves the same whether it is called on the
        class or on a member.
        """
        for member in GroupDtype:
            if member.value.name == compute_dtype:
                return member
        return GroupDtype.group_all

class Precision(Enum):
    """Bit-width labels ("2bit" to "32bit") and an unknown marker."""

    qt_2bit = ModelDetails("2bit")
    qt_3bit = ModelDetails("3bit")
    qt_4bit = ModelDetails("4bit")
    qt_8bit = ModelDetails("8bit")
    qt_16bit = ModelDetails("16bit")
    qt_32bit = ModelDetails("32bit")
    Unknown = ModelDetails("?")

    @staticmethod
    def from_str(precision):
        """Return the member whose display name equals ``precision`` exactly.

        Only the "<n>bit" labels are recognised; dtype-style strings such as
        "float16" or "torch.bfloat16" are not mapped and, like any other
        value, yield ``Precision.Unknown``. Declared static so it behaves the
        same whether it is called on the class or on a member.
        """
        for member in Precision:
            if member.value.name == precision:
                return member
        return Precision.Unknown




# Column selection: display names and datatype labels, in declaration order.
_leaderboard_columns = fields(AutoEvalColumn)
COLS = [column.name for column in _leaderboard_columns]
TYPES = [column.type for column in _leaderboard_columns]

_queue_columns = fields(EvalQueueColumn)
EVAL_COLS = [column.name for column in _queue_columns]
EVAL_TYPES = [column.type for column in _queue_columns]

# Display names of the per-benchmark score columns.
BENCHMARK_COLS = [benchmark.value.col_name for benchmark in Tasks]

# Label -> right-closed interval (lower, upper].
# Presumably buckets of parameter count in billions - confirm against the caller.
NUMERIC_INTERVALS = {
    label: pd.Interval(lower, upper, closed="right")
    for label, lower, upper in (
        ("?", -1, 0),
        ("~1.5", 0, 2),
        ("~3", 2, 4),
        ("~7", 4, 9),
        ("~13", 9, 20),
        ("~35", 20, 45),
        ("~60", 45, 70),
        ("70+", 70, 10000),
    )
}

# Label -> right-closed interval (lower, upper].
# Presumably buckets of model size in GB - confirm against the caller.
# NOTE(review): nothing above 80 falls into any bucket - confirm this is intended.
NUMERIC_MODELSIZE = {
    label: pd.Interval(lower, upper, closed="right")
    for label, lower, upper in (
        ("?", -1, 0),
        ("~4", 0, 4),
        ("~8", 4, 8),
        ("~16", 8, 16),
        ("~36", 16, 36),
        ("~48", 36, 48),
        ("~64", 48, 64),
        ("~72", 64, 80),
    )
}