zhuohan-7 committed on
Commit
917eff6
1 Parent(s): ed4682d

Upload folder using huggingface_hub

Browse files
app/__pycache__/draw_diagram.cpython-310.pyc CHANGED
Binary files a/app/__pycache__/draw_diagram.cpython-310.pyc and b/app/__pycache__/draw_diagram.cpython-310.pyc differ
 
app/__pycache__/pages.cpython-310.pyc CHANGED
Binary files a/app/__pycache__/pages.cpython-310.pyc and b/app/__pycache__/pages.cpython-310.pyc differ
 
app/draw_diagram.py CHANGED
@@ -104,7 +104,11 @@ def draw(folder_name, category_one, category_two, sort, num_sort, model_size_ran
104
  ascending=False
105
  ).reset_index(drop=True)
106
 
107
- styled_df = chart_data_table.style.highlight_max(
 
 
 
 
108
  subset=[chart_data_table.columns[2]], color='yellow'
109
  )
110
 
 
104
  ascending=False
105
  ).reset_index(drop=True)
106
 
107
+ styled_df = chart_data_table.style.format(
108
+ {
109
+ chart_data_table.columns[i]: "{:.3f}" for i in range(2, len(chart_data_table.columns))
110
+ }
111
+ ).highlight_max(
112
  subset=[chart_data_table.columns[2]], color='yellow'
113
  )
114
 
app/pages.py CHANGED
@@ -11,15 +11,21 @@ def dashboard():
11
  [![GitHub Repo stars](https://img.shields.io/github/stars/SeaEval/SeaEval?style=social)][gh]
12
  """)
13
 
14
- st.markdown("#### News")
15
- st.markdown("Dec, 2024: Update Cross-MMLU, Cross-LogiQA, Cross-XQuad, MMLU, IndoMMLU, SG-Eval-v2 results with new prompts (simple prompts to encourage reasoning).")
16
- st.markdown("Dec, 2024: New models added: SEA-LION v3, Gemma-2, Sailor 2")
17
- st.markdown("Nov, 2024: Update layout and support comparison between models with similar model sizes.")
 
 
 
 
 
 
 
18
 
19
  st.divider()
20
 
21
- seaeval_url = "https://seaeval.github.io/"
22
- st.markdown("#### What is [SeaEval](%s)?" % seaeval_url)
23
 
24
  with st.container():
25
  left_co, cent_co,last_co = st.columns(3)
@@ -64,13 +70,14 @@ def dashboard():
64
  st.markdown("##### Citations")
65
 
66
  st.markdown('''
67
- :round_pushpin: SeaEval Paper \n
68
  @article{SeaEval,
69
  title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
70
  author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
71
  journal={NAACL},
72
  year={2024}
73
  }
 
74
  ''')
75
 
76
 
@@ -80,11 +87,8 @@ def cross_lingual_consistency():
80
  filters_levelone = ['Zero Shot', 'Few Shot']
81
  filters_leveltwo = [
82
  'Cross-MMLU',
83
- #'Cross-MMLU-No-Prompt',
84
  'Cross-XQUAD',
85
- #'Cross-XQUAD-No-Prompt',
86
  'Cross-LogiQA',
87
- #'Cross-LogiQA-No-Prompt',
88
  ]
89
 
90
  category_one_dict = {
@@ -94,11 +98,8 @@ def cross_lingual_consistency():
94
 
95
  category_two_dict = {
96
  'Cross-MMLU' : 'cross_mmlu_no_prompt',
97
- #'Cross-MMLU-No-Prompt' : 'cross_mmlu_no_prompt',
98
  'Cross-XQUAD' : 'cross_xquad_no_prompt',
99
- #'Cross-XQUAD-No-Prompt' : 'cross_xquad_no_prompt',
100
  'Cross-LogiQA' : 'cross_logiqa_no_prompt',
101
- #'Cross-LogiQA-No-Prompt': 'cross_logiqa_no_prompt',
102
  }
103
 
104
  left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
@@ -128,7 +129,6 @@ def cultural_reasoning():
128
  filters_levelone = ['Zero Shot', 'Few Shot']
129
  filters_leveltwo = [
130
  'SG-EVAL-v2-MCQ',
131
- #'SG EVAL V2 MCQ No Prompt',
132
  'SG-EVAL-v2-Open-Ended',
133
  'SG-EVAL-v1-Cleaned',
134
  'SG-EVAL-v1',
@@ -145,7 +145,6 @@ def cultural_reasoning():
145
  'SG-EVAL-v2-MCQ' : 'sg_eval_v2_mcq_no_prompt',
146
  'SG-EVAL-v1' : 'sg_eval',
147
  'SG-EVAL-v1-Cleaned' : 'sg_eval_v1_cleaned',
148
- # 'SG EVAL V2 MCQ No Prompt': 'sg_eval_v2_mcq_no_prompt',
149
  'SG-EVAL-v2-Open-Ended' : 'sg_eval_v2_open',
150
  'US-EVAL' : 'us_eval',
151
  'CN-EVAL' : 'cn_eval',
@@ -175,24 +174,22 @@ def general_reasoning():
175
  filters_leveltwo = [
176
  'IndoMMLU',
177
  'MMLU',
178
- #'MMLU-No-Prompt',
179
  'CMMLU',
180
- #'IndoMMLU-No-Prompt',
181
  'C-Eval',
182
  'ZBench',
183
  ]
184
 
185
- category_one_dict = {'Zero Shot': 'zero_shot',
186
- 'Few Shot': 'few_shot'}
 
 
187
 
188
  category_two_dict = {
189
  'IndoMMLU': 'indommlu_no_prompt',
190
- 'MMLU': 'mmlu_no_prompt',
191
- #'MMLU-No-Prompt': 'mmlu_no_prompt',
192
- 'C-Eval': 'c_eval',
193
- 'CMMLU': 'cmmlu',
194
- 'ZBench': 'zbench',
195
- #'IndoMMLU-No-Prompt': 'indommlu_no_prompt',
196
  }
197
 
198
  left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
@@ -215,18 +212,23 @@ def flores():
215
  st.title("Task: FLORES-Translation")
216
 
217
  filters_levelone = ['Zero Shot', 'Few Shot']
218
- filters_leveltwo = ['Indonesian to English',
 
219
  'Vitenamese to English',
220
  'Chinese to English',
221
  'Malay to English'
222
  ]
223
 
224
- category_one_dict = {'Zero Shot': 'zero_shot',
225
- 'Few Shot': 'few_shot'}
226
- category_two_dict = {'Indonesian to English': 'ind2eng',
227
- 'Vitenamese to English': 'vie2eng',
228
- 'Chinese to English': 'zho2eng',
229
- 'Malay to English': 'zsm2eng'}
 
 
 
 
230
 
231
 
232
  left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
@@ -256,8 +258,10 @@ def emotion():
256
 
257
  category_one_dict = {'Zero Shot': 'zero_shot',
258
  'Few Shot': 'few_shot'}
259
- category_two_dict = {'Indonesian Emotion Classification': 'ind_emotion',
260
- 'SST2': 'sst2'}
 
 
261
 
262
  left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
263
  with left:
@@ -285,11 +289,15 @@ def dialogue():
285
  'DialogSum',
286
  ]
287
 
288
- category_one_dict = {'Zero Shot': 'zero_shot',
289
- 'Few Shot': 'few_shot'}
290
- category_two_dict = {'DREAM': 'dream',
291
- 'SAMSum': 'samsum',
292
- 'DialogSum': 'dialogsum'}
 
 
 
 
293
 
294
  left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
295
  with left:
@@ -319,17 +327,21 @@ def fundamental_nlp_tasks():
319
  filters_levelone = ['Zero Shot', 'Few Shot']
320
  filters_leveltwo = ['OCNLI', 'C3', 'COLA', 'QQP', 'MNLI', 'QNLI', 'WNLI', 'RTE', 'MRPC']
321
 
322
- category_one_dict = {'Zero Shot': 'zero_shot',
323
- 'Few Shot': 'few_shot'}
324
- category_two_dict = {'OCNLI': 'ocnli',
325
- 'C3': 'c3',
326
- 'COLA': 'cola',
327
- 'QQP': 'qqp',
328
- 'MNLI': 'mnli',
329
- 'QNLI': 'qnli',
330
- 'WNLI': 'wnli',
331
- 'RTE': 'rte',
332
- 'MRPC': 'mrpc'}
 
 
 
 
333
 
334
  left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
335
  with left:
 
11
  [![GitHub Repo stars](https://img.shields.io/github/stars/SeaEval/SeaEval?style=social)][gh]
12
  """)
13
 
14
+
15
+ st.markdown("""
16
+ ### Changelog
17
+
18
+ - **Dec 2024**:
19
+ - Updated results for **Cross-MMLU**, **Cross-LogiQA**, **Cross-XQuad**, **MMLU**, **IndoMMLU**, and **SG-Eval-v2** with new prompts (simple prompts to encourage reasoning).
20
+ - Added new models: **SEA-LION v3**, **Gemma-2**, and **Sailor 2**.
21
+
22
+ - **Nov 2024**:
23
+ - Updated layout and added support for comparison between models with similar sizes.
24
+ """)
25
 
26
  st.divider()
27
 
28
+ st.markdown("#### What is [SeaEval](https://seaeval.github.io/)?")
 
29
 
30
  with st.container():
31
  left_co, cent_co,last_co = st.columns(3)
 
70
  st.markdown("##### Citations")
71
 
72
  st.markdown('''
73
+ ```
74
  @article{SeaEval,
75
  title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
76
  author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
77
  journal={NAACL},
78
  year={2024}
79
  }
80
+ ```
81
  ''')
82
 
83
 
 
87
  filters_levelone = ['Zero Shot', 'Few Shot']
88
  filters_leveltwo = [
89
  'Cross-MMLU',
 
90
  'Cross-XQUAD',
 
91
  'Cross-LogiQA',
 
92
  ]
93
 
94
  category_one_dict = {
 
98
 
99
  category_two_dict = {
100
  'Cross-MMLU' : 'cross_mmlu_no_prompt',
 
101
  'Cross-XQUAD' : 'cross_xquad_no_prompt',
 
102
  'Cross-LogiQA' : 'cross_logiqa_no_prompt',
 
103
  }
104
 
105
  left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
 
129
  filters_levelone = ['Zero Shot', 'Few Shot']
130
  filters_leveltwo = [
131
  'SG-EVAL-v2-MCQ',
 
132
  'SG-EVAL-v2-Open-Ended',
133
  'SG-EVAL-v1-Cleaned',
134
  'SG-EVAL-v1',
 
145
  'SG-EVAL-v2-MCQ' : 'sg_eval_v2_mcq_no_prompt',
146
  'SG-EVAL-v1' : 'sg_eval',
147
  'SG-EVAL-v1-Cleaned' : 'sg_eval_v1_cleaned',
 
148
  'SG-EVAL-v2-Open-Ended' : 'sg_eval_v2_open',
149
  'US-EVAL' : 'us_eval',
150
  'CN-EVAL' : 'cn_eval',
 
174
  filters_leveltwo = [
175
  'IndoMMLU',
176
  'MMLU',
 
177
  'CMMLU',
 
178
  'C-Eval',
179
  'ZBench',
180
  ]
181
 
182
+ category_one_dict = {
183
+ 'Zero Shot': 'zero_shot',
184
+ 'Few Shot' : 'few_shot'
185
+ }
186
 
187
  category_two_dict = {
188
  'IndoMMLU': 'indommlu_no_prompt',
189
+ 'MMLU' : 'mmlu_no_prompt',
190
+ 'C-Eval' : 'c_eval',
191
+ 'CMMLU' : 'cmmlu',
192
+ 'ZBench' : 'zbench',
 
 
193
  }
194
 
195
  left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
 
212
  st.title("Task: FLORES-Translation")
213
 
214
  filters_levelone = ['Zero Shot', 'Few Shot']
215
+ filters_leveltwo = [
216
+ 'Indonesian to English',
217
  'Vitenamese to English',
218
  'Chinese to English',
219
  'Malay to English'
220
  ]
221
 
222
+ category_one_dict = {
223
+ 'Zero Shot': 'zero_shot',
224
+ 'Few Shot' : 'few_shot'
225
+ }
226
+ category_two_dict = {
227
+ 'Indonesian to English': 'ind2eng',
228
+ 'Vitenamese to English': 'vie2eng',
229
+ 'Chinese to English' : 'zho2eng',
230
+ 'Malay to English' : 'zsm2eng'
231
+ }
232
 
233
 
234
  left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
 
258
 
259
  category_one_dict = {'Zero Shot': 'zero_shot',
260
  'Few Shot': 'few_shot'}
261
+ category_two_dict = {
262
+ 'Indonesian Emotion Classification': 'ind_emotion',
263
+ 'SST2' : 'sst2'
264
+ }
265
 
266
  left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
267
  with left:
 
289
  'DialogSum',
290
  ]
291
 
292
+ category_one_dict = {
293
+ 'Zero Shot': 'zero_shot',
294
+ 'Few Shot' : 'few_shot'
295
+ }
296
+ category_two_dict = {
297
+ 'DREAM' : 'dream',
298
+ 'SAMSum' : 'samsum',
299
+ 'DialogSum': 'dialogsum'
300
+ }
301
 
302
  left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
303
  with left:
 
327
  filters_levelone = ['Zero Shot', 'Few Shot']
328
  filters_leveltwo = ['OCNLI', 'C3', 'COLA', 'QQP', 'MNLI', 'QNLI', 'WNLI', 'RTE', 'MRPC']
329
 
330
+ category_one_dict = {
331
+ 'Zero Shot': 'zero_shot',
332
+ 'Few Shot' : 'few_shot'
333
+ }
334
+ category_two_dict = {
335
+ 'OCNLI': 'ocnli',
336
+ 'C3' : 'c3',
337
+ 'COLA' : 'cola',
338
+ 'QQP' : 'qqp',
339
+ 'MNLI' : 'mnli',
340
+ 'QNLI' : 'qnli',
341
+ 'WNLI' : 'wnli',
342
+ 'RTE' : 'rte',
343
+ 'MRPC' : 'mrpc'
344
+ }
345
 
346
  left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
347
  with left: