Spaces:
Running
Running
update the interface
Browse files- src/display/about.py +11 -1
- src/leaderboard/load_results.py +8 -8
src/display/about.py
CHANGED
@@ -29,7 +29,17 @@ Also check the [SeaBench leaderboard](https://huggingface.co/spaces/SeaLLMs/SeaB
|
|
29 |
|
30 |
# Which evaluations are you running? how can people reproduce what you have?
|
31 |
LLM_BENCHMARKS_TEXT = f"""
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
## Reproducibility
|
35 |
To reproduce our results, here are the commands you can run:
|
|
|
29 |
|
30 |
# Which evaluations are you running? how can people reproduce what you have?
|
31 |
LLM_BENCHMARKS_TEXT = f"""
|
32 |
+
# About
|
33 |
+
Even though large language models (LLMs) have shown impressive performance on various benchmarks for English, their performance on Southeast Asian (SEA) languages is still underexplored. This leaderboard aims to evaluate LLMs on exam-type benchmarks for SEA languages, focusing on world knowledge and reasoning abilities.
|
34 |
+
|
35 |
+
## Datasets
|
36 |
+
The leaderboard evaluates models on the following tasks:
|
37 |
+
- **M3Exam**:
|
38 |
+
- **MMLU**:
|
39 |
+
|
40 |
+
## Evaluation Criteria
|
41 |
+
|
42 |
+
## Results
|
43 |
|
44 |
## Reproducibility
|
45 |
To reproduce our results, here are the commands you can run:
|
src/leaderboard/load_results.py
CHANGED
@@ -25,7 +25,7 @@ def load_data(data_path):
|
|
25 |
df = pd.read_csv(data_path, skiprows=1, header=0).dropna()
|
26 |
|
27 |
columns = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
|
28 |
-
columns_sorted = ['rank','type', 'Model',
|
29 |
|
30 |
# Splitting into three separate DataFrames based on the groups M3Exam and MMLU and average
|
31 |
df_m3exam = df.iloc[:, :11] # M3Exam columns
|
@@ -54,15 +54,15 @@ def load_data(data_path):
|
|
54 |
df_mmlu = df_mmlu.sort_values(by='avg_sea', ascending=False)
|
55 |
df_avg = df_avg.sort_values(by='avg_sea', ascending=False)
|
56 |
|
57 |
-
# change the column name from 'avg_sea' to 'avg_sea
|
58 |
-
df_m3exam = df_m3exam.rename(columns={'avg_sea': 'avg_sea
|
59 |
-
df_mmlu = df_mmlu.rename(columns={'avg_sea': 'avg_sea
|
60 |
-
df_avg = df_avg.rename(columns={'avg_sea': 'avg_sea
|
61 |
|
62 |
# map the values in the 'type' column to the following values: {'base': 'Base', 'chat': 'Chat'}
|
63 |
-
df_m3exam['type'] = df_m3exam['type'].map({'base': '🟢', 'chat': '🔶'})
|
64 |
-
df_mmlu['type'] = df_mmlu['type'].map({'base': '🟢', 'chat': '🔶'})
|
65 |
-
df_avg['type'] = df_avg['type'].map({'base': '🟢', 'chat': '🔶'})
|
66 |
|
67 |
return df_m3exam, df_mmlu, df_avg
|
68 |
|
|
|
25 |
df = pd.read_csv(data_path, skiprows=1, header=0).dropna()
|
26 |
|
27 |
columns = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
|
28 |
+
columns_sorted = ['rank','type', 'Model', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'open?',]
|
29 |
|
30 |
# Splitting into three separate DataFrames based on the groups M3Exam and MMLU and average
|
31 |
df_m3exam = df.iloc[:, :11] # M3Exam columns
|
|
|
54 |
df_mmlu = df_mmlu.sort_values(by='avg_sea', ascending=False)
|
55 |
df_avg = df_avg.sort_values(by='avg_sea', ascending=False)
|
56 |
|
57 |
+
# change the column name from 'avg_sea' to 'avg_sea ⬇️'
|
58 |
+
df_m3exam = df_m3exam.rename(columns={'avg_sea': 'avg_sea ⬇️'})
|
59 |
+
df_mmlu = df_mmlu.rename(columns={'avg_sea': 'avg_sea ⬇️'})
|
60 |
+
df_avg = df_avg.rename(columns={'avg_sea': 'avg_sea ⬇️'})
|
61 |
|
62 |
# map the values in the 'type' column to the following values: {'base': '🟢base', 'chat': '🔶chat'}
|
63 |
+
df_m3exam['type'] = df_m3exam['type'].map({'base': '🟢base', 'chat': '🔶chat'})
|
64 |
+
df_mmlu['type'] = df_mmlu['type'].map({'base': '🟢base', 'chat': '🔶chat'})
|
65 |
+
df_avg['type'] = df_avg['type'].map({'base': '🟢base', 'chat': '🔶chat'})
|
66 |
|
67 |
return df_m3exam, df_mmlu, df_avg
|
68 |
|