File size: 4,297 Bytes
256a159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from mmengine.config import read_base

# Assemble this config from three sibling config modules. The imports are
# placed inside mmengine's `read_base()` context manager instead of at plain
# module level. None of the imported names is referenced again in this file's
# visible lines, so they are presumably picked up by the config loader as the
# top-level `models`, `datasets` and `summarizer` entries -- TODO confirm
# against the mmengine config documentation.
with read_base():
    # `models`: taken from the `hf_qwen_7b` module under `.models.qwen`.
    from .models.qwen.hf_qwen_7b import models
    # `datasets`: taken from the `qwen` module of the leaderboard collection.
    from .datasets.collections.leaderboard.qwen import datasets
    # `summarizer`: taken from the `leaderboard` module under `.summarizers`.
    from .summarizers.leaderboard import summarizer

'''
dataset                                 version    metric            mode    qwen-7b-hf
--------------------------------------  ---------  ----------------  ------  ------------
--------- 考试 Exam ---------           -          -                 -       -
ceval                                   -          naive_average     ppl     58.65
agieval                                 -          naive_average     mixed   40.49
mmlu                                    -          naive_average     ppl     57.78
cmmlu                                   -          naive_average     ppl     58.57
GaokaoBench                             -          weighted_average  mixed   51.76
ARC-c                                   72cf91     accuracy          gen     83.73
ARC-e                                   72cf91     accuracy          gen     90.65
--------- 语言 Language ---------       -          -                 -       -
WiC                                     ce62e6     accuracy          ppl     51.10
chid-dev                                25f3d3     accuracy          ppl     86.63
afqmc-dev                               cc328c     accuracy          ppl     69.00
WSC                                     678cb5     accuracy          ppl     63.46
tydiqa-goldp                            -          naive_average     gen     19.98
flores_100                              -          naive_average     gen     3.20
--------- 知识 Knowledge ---------      -          -                 -       -
BoolQ                                   463fee     accuracy          ppl     83.00
commonsense_qa                          0d8e25     accuracy          ppl     67.49
triviaqa                                b6904f     score             gen     40.45
nq                                      b6904f     score             gen     14.16
--------- 理解 Understanding ---------  -          -                 -       -
C3                                      e6778d     accuracy          gen     75.29
race-middle                             73bdec     accuracy          ppl     90.53
race-high                               73bdec     accuracy          ppl     87.71
openbookqa_fact                         fa871c     accuracy          gen     92.20
csl_dev                                 3c4211     accuracy          ppl     56.25
lcsts                                   0b3969     rouge1            gen     12.38
Xsum                                    207e69     rouge1            gen     36.00
eprstmt-dev                             101429     accuracy          gen     89.38
lambada                                 de1af2     accuracy          gen     67.88
--------- 推理 Reasoning ---------      -          -                 -       -
cmnli                                   15e783     accuracy          ppl     54.85
ocnli                                   1471e7     accuracy          gen     42.34
AX_b                                    793c72     accuracy          gen     58.61
AX_g                                    c4c886     accuracy          gen     69.10
RTE                                     c4c886     accuracy          gen     57.76
COPA                                    59f42c     accuracy          gen     88.00
ReCoRD                                  3e0689     score             gen     27.78
hellaswag                               06a1e2     accuracy          gen     92.47
piqa                                    24369d     accuracy          gen     78.02
siqa                                    ea30d1     accuracy          ppl     75.03
math                                    2c0b9e     accuracy          gen     11.06
gsm8k                                   4c7f6e     accuracy          gen     50.87
drop                                    53a0a7     score             gen     44.95
openai_humaneval                        dd0dff     humaneval_pass@1  gen     23.78
mbpp                                    60ca11     score             gen     31.20
bbh                                     -          naive_average     gen     40.03
'''