isakzhang committed on
Commit
d4c15d2
1 Parent(s): 01bb2e0

update scripts

eval-results/.gitattributes ADDED
@@ -0,0 +1,55 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.lz4 filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ # Audio files - uncompressed
+ *.pcm filter=lfs diff=lfs merge=lfs -text
+ *.sam filter=lfs diff=lfs merge=lfs -text
+ *.raw filter=lfs diff=lfs merge=lfs -text
+ # Audio files - compressed
+ *.aac filter=lfs diff=lfs merge=lfs -text
+ *.flac filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
+ *.ogg filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ # Image files - uncompressed
+ *.bmp filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.tiff filter=lfs diff=lfs merge=lfs -text
+ # Image files - compressed
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.webp filter=lfs diff=lfs merge=lfs -text
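These `.gitattributes` rules tell Git LFS which files to store as pointers rather than plain blobs. A minimal sketch of how such patterns could be checked from Python, using `fnmatch` as a rough stand-in for Git's own wildmatch rules (the helper name and the simplified matching are assumptions, not part of this commit):

```python
from fnmatch import fnmatch

# A few of the patterns tracked above (subset; the full list is in .gitattributes).
LFS_PATTERNS = ["*.parquet", "*.safetensors", "*.zip", "*.png", "saved_model/**/*"]

def is_lfs_tracked(path: str, patterns=LFS_PATTERNS) -> bool:
    """Rough check of whether `path` would be stored via Git LFS.

    fnmatch only approximates Git's wildmatch semantics (e.g. `**` handling
    differs), so treat this as illustrative rather than authoritative.
    """
    name = path.rsplit("/", 1)[-1]
    return any(fnmatch(path, pat) or fnmatch(name, pat) for pat in patterns)

print(is_lfs_tracked("model.safetensors"))                 # True
print(is_lfs_tracked("eval-results/SeaExam_results.csv"))  # False: *.csv is not LFS-tracked
```

Note that `*.csv` is not in the list, which is why the results file below is committed as plain text.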
eval-results/SeaExam_results.csv ADDED
@@ -0,0 +1,47 @@
+ ,,,,M3Exam,,,,,,,MMLU,,,,,,,AVG,,,,,,
+ Model,type,open?,shot,en,zh,id,th,vi,avg,avg_sea,en,zh,id,th,vi,avg,avg_sea,en,zh,id,th,vi,avg,avg_sea
+ gpt-4-turbo-1106,chat,N,0,0.87683,0.78882,0.64873,0.68956,0.70774,0.74234,0.68201,0.79825,0.72912,0.74526,0.67088,0.71053,0.73081,0.70889,0.83754,0.75897,0.69700,0.68022,0.70913,0.73657,0.69545
+ Meta-Llama-3-70B,base,Y,3,0.84382,0.75621,0.61899,0.66181,0.68252,0.71267,0.65444,0.78772,0.70491,0.73509,0.65930,0.70526,0.71846,0.69988,0.81577,0.73056,0.67704,0.66055,0.69389,0.71556,0.67716
+ Meta-Llama-3-70B-Instruct,chat,Y,3,0.86321,0.69410,0.62975,0.64299,0.68424,0.70286,0.65233,0.79965,0.69088,0.72316,0.63228,0.68912,0.70702,0.68152,0.83143,0.69249,0.67645,0.63764,0.68668,0.70494,0.66692
+ Qwen1.5-72B,base,Y,3,0.83857,0.92547,0.58734,0.56820,0.64756,0.71343,0.60104,0.74491,0.69474,0.66456,0.56351,0.63088,0.65972,0.61965,0.79174,0.81010,0.62595,0.56586,0.63922,0.68657,0.61034
+ claude-3-sonnet-20240229,chat,N,0,0.78878,0.68323,0.58544,0.57150,0.62579,0.65095,0.59424,0.71333,0.60456,0.63684,0.54070,0.58421,0.61593,0.58725,0.75106,0.64390,0.61114,0.55610,0.60500,0.63344,0.59075
+ claude-3-haiku-20240307,chat,N,0,0.79036,0.65217,0.56266,0.57291,0.63095,0.64181,0.58884,0.71053,0.60526,0.61193,0.51474,0.56316,0.60112,0.56327,0.75044,0.62872,0.58729,0.54382,0.59705,0.62147,0.57606
+ dbrx-base,base,Y,3,0.80818,0.68944,0.53418,0.50659,0.60458,0.62859,0.54845,0.73123,0.64281,0.64456,0.47368,0.61754,0.62196,0.57860,0.76970,0.66612,0.58937,0.49013,0.61106,0.62528,0.56352
+ Mixtral-8x22B-v0.1,base,Y,3,0.83910,0.69565,0.56962,0.48730,0.60115,0.63856,0.55269,0.76877,0.62491,0.64667,0.45018,0.57649,0.61340,0.55778,0.80394,0.66028,0.60814,0.46874,0.58882,0.62598,0.55523
+ SeaLLM-7B-v2.5,chat,Y,3,0.75943,0.60248,0.50063,0.50659,0.61834,0.59749,0.54185,0.64877,0.53719,0.56772,0.48667,0.53018,0.55411,0.52819,0.70410,0.56984,0.53418,0.49663,0.57426,0.57580,0.53502
+ Qwen1.5-14B,base,Y,3,0.79665,0.86180,0.52722,0.47836,0.54900,0.64260,0.51819,0.67509,0.60211,0.55719,0.44491,0.52351,0.56056,0.50854,0.73587,0.73195,0.54220,0.46164,0.53625,0.60158,0.51336
+ gemini-1.0-pro,chat,N,0,0.56866,0.72516,0.43987,0.49247,0.60516,0.56626,0.51250,0.54912,0.59684,0.53368,0.43895,0.55298,0.53432,0.50854,0.55889,0.66100,0.48678,0.46571,0.57907,0.55029,0.51052
+ gemma-7b,base,Y,3,0.73061,0.52795,0.46456,0.46284,0.59656,0.55650,0.50799,0.63579,0.50772,0.55228,0.48842,0.49684,0.53621,0.51251,0.68320,0.51783,0.50842,0.47563,0.54670,0.54636,0.51025
+ gpt-3.5-turbo-0125,,N,3,0.75105,0.58851,0.50000,0.38852,0.53352,0.55232,0.47402,0.68211,0.54912,0.59088,0.38596,0.50246,0.54211,0.49310,0.71658,0.56882,0.54544,0.38724,0.51799,0.54721,0.48356
+ Mixtral-8x7B-v0.1,base,Y,3,0.77096,0.60559,0.47975,0.43509,0.52206,0.56269,0.47897,0.70351,0.54140,0.56632,0.39298,0.49404,0.53965,0.48444,0.73724,0.57350,0.52303,0.41404,0.50805,0.55117,0.48171
+ Llama-2-70b-hf,base,Y,3,0.74895,0.59938,0.49177,0.34478,0.55931,0.54884,0.46529,0.68526,0.55965,0.58982,0.32737,0.52035,0.53649,0.47918,0.71711,0.57951,0.54080,0.33607,0.53983,0.54267,0.47223
+ Meta-Llama-3-8B,base,Y,3,0.70021,0.54037,0.42722,0.45390,0.50888,0.52612,0.46333,0.63193,0.48561,0.51158,0.43579,0.49053,0.51109,0.47930,0.66607,0.51299,0.46940,0.44485,0.49970,0.51860,0.47132
+ Sailor-7B-Chat,chat,Y,3,0.65618,0.65062,0.47405,0.46425,0.51175,0.55137,0.48335,0.55579,0.47509,0.48526,0.41789,0.46105,0.47902,0.45474,0.60599,0.56285,0.47966,0.44107,0.48640,0.51519,0.46904
+ gpt-3.5-turbo-0125,chat,N,0,0.75577,0.60559,0.49304,0.39652,0.52894,0.55597,0.47283,0.67228,0.53018,0.56667,0.36070,0.46281,0.51853,0.46339,0.71402,0.56788,0.52985,0.37861,0.49587,0.53725,0.46811
+ Yi-34B,base,Y,3,0.81499,0.86025,0.54114,0.38147,0.50201,0.61997,0.47487,0.75860,0.68386,0.60105,0.31439,0.45018,0.56161,0.45520,0.78679,0.77205,0.57110,0.34793,0.47609,0.59079,0.46504
+ Meta-Llama-3-8B-Instruct,chat,Y,3,0.72537,0.53727,0.46646,0.37065,0.50946,0.52184,0.44885,0.64912,0.48246,0.50421,0.36702,0.47544,0.49565,0.44889,0.68724,0.50986,0.48533,0.36883,0.49245,0.50874,0.44887
+ SeaLLM-7B-v2,chat,Y,3,0.70178,0.51553,0.43165,0.40593,0.51519,0.51401,0.45092,0.61474,0.45930,0.49158,0.36246,0.44246,0.47411,0.43216,0.65826,0.48741,0.46161,0.38419,0.47882,0.49406,0.44154
+ Sailor-7B,base,Y,3,0.61111,0.63199,0.44304,0.40969,0.49914,0.51899,0.45062,0.52456,0.44737,0.45614,0.40070,0.43754,0.45326,0.43146,0.56784,0.53968,0.44959,0.40520,0.46834,0.48613,0.44104
+ Qwen1.5-7B-Chat,chat,Y,3,0.64570,0.62733,0.43038,0.39793,0.49226,0.51872,0.44019,0.58351,0.51579,0.42772,0.36316,0.44667,0.46737,0.41251,0.61461,0.57156,0.42905,0.38054,0.46947,0.49304,0.42635
+ Yi-9B,base,Y,3,0.77516,0.79193,0.49241,0.35748,0.45330,0.57405,0.43439,0.67684,0.59263,0.50772,0.29404,0.38140,0.49053,0.39439,0.72600,0.69228,0.50006,0.32576,0.41735,0.53229,0.41439
+ Qwen1.5-7B,base,Y,3,0.72117,0.81056,0.44114,0.36124,0.44986,0.55679,0.41741,0.61228,0.51509,0.45895,0.34105,0.41333,0.46814,0.40444,0.66673,0.66282,0.45004,0.35115,0.43159,0.51247,0.41093
+ Mistral-7B-v0.1,base,Y,3,0.67715,0.49689,0.42152,0.34572,0.40860,0.46998,0.39194,0.60877,0.45754,0.47053,0.31579,0.40351,0.45123,0.39661,0.64296,0.47722,0.44602,0.33075,0.40605,0.46060,0.39428
+ gemma-7b-it,chat,Y,3,0.62159,0.42702,0.37342,0.32079,0.46705,0.44197,0.38709,0.52421,0.42632,0.41719,0.34456,0.39298,0.42105,0.38491,0.57290,0.42667,0.39531,0.33268,0.43002,0.43151,0.38600
+ Mistral-7B-Instruct-v0.2,chat,Y,3,0.65671,0.49534,0.40443,0.30386,0.39885,0.45184,0.36905,0.58877,0.43404,0.44246,0.32596,0.38211,0.43467,0.38351,0.62274,0.46469,0.42344,0.31491,0.39048,0.44325,0.37628
+ Qwen1.5-4B,base,Y,3,0.66352,0.77174,0.35127,0.31891,0.38854,0.49879,0.35290,0.55018,0.46807,0.39298,0.31193,0.36947,0.41853,0.35813,0.60685,0.61990,0.37212,0.31542,0.37901,0.45866,0.35552
+ Yi-6B,base,Y,3,0.70440,0.80901,0.41076,0.29821,0.37020,0.51852,0.35972,0.62175,0.54316,0.43825,0.26140,0.33368,0.43965,0.34444,0.66308,0.67608,0.42450,0.27981,0.35194,0.47908,0.35208
+ Llama-2-13b-hf,base,Y,3,0.60535,0.36491,0.38418,0.28786,0.40860,0.41018,0.36021,0.53368,0.38877,0.42421,0.24175,0.36386,0.39046,0.34327,0.56952,0.37684,0.40419,0.26481,0.38623,0.40032,0.35174
+ Llama-2-13b-chat-hf,chat,Y,3,0.58910,0.38199,0.37152,0.28833,0.38968,0.40412,0.34985,0.53088,0.38281,0.40351,0.25789,0.34561,0.38414,0.33567,0.55999,0.38240,0.38751,0.27311,0.36765,0.39413,0.34276
+ Qwen1.5-MoE-A2.7B,base,Y,3,0.62788,0.78882,0.36582,0.25400,0.40172,0.48765,0.34051,0.56456,0.49123,0.40772,0.26070,0.31684,0.40821,0.32842,0.59622,0.64002,0.38677,0.25735,0.35928,0.44793,0.33447
+ gemma-2b-it,chat,Y,3,0.43868,0.37733,0.31646,0.28363,0.35702,0.35462,0.31904,0.37789,0.33614,0.33930,0.30526,0.32035,0.33579,0.32164,0.40829,0.35673,0.32788,0.29445,0.33869,0.34521,0.32034
+ Llama-2-7b-chat-hf,chat,Y,3,0.56604,0.32609,0.34114,0.26811,0.34040,0.36835,0.31655,0.48211,0.35509,0.35789,0.25684,0.33298,0.35698,0.31591,0.52407,0.34059,0.34952,0.26248,0.33669,0.36267,0.31623
+ bloomz-7b1,chat,Y,3,0.43082,0.37733,0.36139,0.25588,0.35645,0.35637,0.32457,0.36561,0.34386,0.32526,0.22386,0.31684,0.31509,0.28865,0.39822,0.36059,0.34333,0.23987,0.33664,0.33573,0.30661
+ gemma-2b,base,Y,3,0.41719,0.27484,0.30443,0.28645,0.31576,0.31974,0.30221,0.37860,0.30281,0.31474,0.30070,0.30667,0.32070,0.30737,0.39789,0.28883,0.30958,0.29358,0.31121,0.32022,0.30479
+ Llama-2-7b-hf,base,Y,3,0.49109,0.32298,0.30823,0.26341,0.31748,0.34064,0.29637,0.44982,0.33439,0.34421,0.25930,0.30877,0.33930,0.30409,0.47046,0.32868,0.32622,0.26135,0.31313,0.33997,0.30023
+ Qwen1.5-1.8B,base,Y,3,0.54612,0.71273,0.32595,0.24365,0.32378,0.43045,0.29779,0.46211,0.39018,0.32702,0.24456,0.32281,0.34933,0.29813,0.50411,0.55145,0.32648,0.24411,0.32329,0.38989,0.29796
+ Qwen1.5-0.5B,base,Y,3,0.44602,0.61025,0.29367,0.26011,0.29742,0.38149,0.28373,0.38737,0.32421,0.29649,0.28456,0.29965,0.31846,0.29357,0.41669,0.46723,0.29508,0.27234,0.29854,0.34998,0.28865
+ sea-lion-7b-instruct,chat,Y,3,0.26992,0.27329,0.28671,0.26435,0.26877,0.27261,0.27327,0.26947,0.26070,0.25684,0.26526,0.25474,0.26140,0.25895,0.26969,0.26700,0.27178,0.26480,0.26175,0.26700,0.26611
+ sea-lion-7b,base,Y,3,0.24476,0.22826,0.25443,0.26435,0.24126,0.24661,0.25335,0.24772,0.26175,0.24982,0.24491,0.26351,0.25354,0.25275,0.24624,0.24501,0.25213,0.25463,0.25238,0.25008,0.25305
+ phi-2,base,Y,3,0.58176,0.28571,0.29494,0.20978,0.26934,0.32831,0.25802,0.56842,0.29439,0.29333,0.14105,0.26842,0.31312,0.23427,0.57509,0.29005,0.29414,0.17542,0.26888,0.32072,0.24614
+ bloom-7b1,base,Y,3,0.22694,0.18323,0.25316,0.24036,0.24298,0.22933,0.24550,0.25088,0.23895,0.25158,0.23684,0.24456,0.24456,0.24433,0.23891,0.21109,0.25237,0.23860,0.24377,0.23695,0.24491
+ claude-3-opus-20240229,chat,N,0,,,0.70316,0.73330,0.74613,,0.72753,,,,,,,,,,,,,,
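The CSV carries a two-row header: the first row only labels the three column blocks (M3Exam, MMLU, AVG), and the second row names the metadata and per-language columns. A minimal loading sketch, mirroring the `skiprows=1, header=0` logic in `src/leaderboard/load_results.py` further down (only the M3Exam slice appears verbatim in the diff; the MMLU/AVG slices here are illustrative):

```python
import pandas as pd

# The first row is the block header (M3Exam / MMLU / AVG); the second row holds
# the real column names, hence skiprows=1, header=0 as in load_results.py.
df = pd.read_csv("eval-results/SeaExam_results.csv", skiprows=1, header=0).dropna()

# Columns 0-3 are metadata (Model, type, open?, shot); each benchmark block then
# contributes en, zh, id, th, vi, avg, avg_sea (7 columns).
df_m3exam = df.iloc[:, :11]                                       # metadata + M3Exam
df_mmlu = pd.concat([df.iloc[:, :4], df.iloc[:, 11:18]], axis=1)  # metadata + MMLU (assumed slice)
df_avg = pd.concat([df.iloc[:, :4], df.iloc[:, 18:]], axis=1)     # metadata + AVG (assumed slice)

print(df_avg.head(3))
```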
eval-results/demo-leaderboard/gpt2-demo/results_2023-11-21T18-10-08.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "config": {
+     "model_dtype": "torch.float16",
+     "model_name": "demo-leaderboard/gpt2-demo",
+     "model_sha": "ac3299b02780836378b9e1e68c6eead546e89f90"
+   },
+   "results": {
+     "task_name1": {
+       "metric_name": 0
+     },
+     "task_name2": {
+       "metric_name": 0.90
+     }
+   }
+ }
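This demo result file follows the usual leaderboard layout: a `config` block identifying the evaluated model and a `results` block keyed by task and metric. A minimal sketch of reading it (the flattening helper is an illustrative assumption, not code from this repo):

```python
import json

# Path of the demo result file added in this commit.
path = "eval-results/demo-leaderboard/gpt2-demo/results_2023-11-21T18-10-08.json"
with open(path) as f:
    result = json.load(f)

model_name = result["config"]["model_name"]  # "demo-leaderboard/gpt2-demo"

# Flatten {task: {metric: value}} into "task/metric" keys for easy tabulation.
scores = {
    f"{task}/{metric}": value
    for task, metrics in result["results"].items()
    for metric, value in metrics.items()
}
print(model_name, scores)  # e.g. {'task_name1/metric_name': 0, 'task_name2/metric_name': 0.9}
```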
src/display/about.py CHANGED
@@ -20,11 +20,12 @@ TITLE = """<h1 align="center" id="space-title">📃 SeaExam Leaderboard</h1>"""
  
  # What does your leaderboard evaluate?
  INTRODUCTION_TEXT = """
- This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. It assesses model performance using human exam-type benchmarks, reflecting the model's world knowledge (e.g., with language or social science subjects) and reasoning abilities (e.g., with mathematics or natural science subjects).
+ This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. It assesses model performance using human-exam type benchmarks, reflecting the model's world knowledge (e.g., with language or social science subjects) and reasoning abilities (e.g., with mathematics or natural science subjects).
  
  For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "📝 About" tab.
  
- Also check the [SeaBench leaderboard](https://huggingface.co/spaces/SeaLLMs/SeaBench_leaderboard) - focusing on evaluating the model's ability to follow instructions in real-world multi-turn settings
+ Also check the [SeaBench leaderboard](https://huggingface.co/spaces/SeaLLMs/SeaBench_leaderboard) - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
+ 
  """
  
  # Which evaluations are you running? how can people reproduce what you have?
src/leaderboard/load_results.py CHANGED
@@ -25,7 +25,7 @@ def load_data(data_path):
      df = pd.read_csv(data_path, skiprows=1, header=0).dropna()
  
      columns = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
-     columns_sorted = ['rank','type', 'Model', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'open?',]
+     columns_sorted = ['rank','type', 'Model', 'open?', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg']
  
      # Splitting into three separate DataFrames based on the groups M3Exam and MMLU and average
      df_m3exam = df.iloc[:, :11] # M3Exam columns
@@ -60,9 +60,9 @@ def load_data(data_path):
      df_avg = df_avg.rename(columns={'avg_sea': 'avg_sea ⬇️'})
  
      # map the values in the 'type' column to the following values: {'base': 'Base', 'chat': 'Chat'}
-     df_m3exam['type'] = df_m3exam['type'].map({'base': '🟢base', 'chat': '🔶chat'})
-     df_mmlu['type'] = df_mmlu['type'].map({'base': '🟢base', 'chat': '🔶chat'})
-     df_avg['type'] = df_avg['type'].map({'base': '🟢base', 'chat': '🔶chat'})
+     df_m3exam['type'] = df_m3exam['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
+     df_mmlu['type'] = df_mmlu['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
+     df_avg['type'] = df_avg['type'].map({'base': '🟢 base', 'chat': '🔶 chat'})
  
      return df_m3exam, df_mmlu, df_avg
  
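For reference, a hedged usage sketch of the updated loader against the CSV added in this commit (the import path assumes the Space's repo root is on `PYTHONPATH`; the call site is an assumption, not shown in the diff):

```python
# Hypothetical driver: load the three leaderboard views from the results CSV above.
from src.leaderboard.load_results import load_data

df_m3exam, df_mmlu, df_avg = load_data("eval-results/SeaExam_results.csv")

# After this commit, 'open?' follows 'Model' in columns_sorted and the type
# labels render with a space after the emoji, e.g. '🟢 base' / '🔶 chat'.
print(df_avg.head())
```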