zhuohan-7's picture
Upload folder using huggingface_hub
917eff6 verified
import streamlit as st
from app.draw_diagram import *
def dashboard() -> None:
    """Render the SeaEval landing page.

    Writes, in order: the title with a GitHub stars badge, the changelog,
    a short description of the benchmark with two centred illustrations,
    and the citation entry. All output goes to the current Streamlit page;
    nothing is returned.
    """
    with st.container():
        st.title("SeaEval")

        # Reference-style link definition followed by the badge that uses it.
        st.markdown(
            "\n"
            "[gh]: https://github.com/SeaEval/SeaEval\n"
            "[![GitHub Repo stars](https://img.shields.io/github/stars/SeaEval/SeaEval?style=social)][gh]\n"
        )

        # Sub-items are indented by two spaces so that Markdown nests them
        # under their month instead of flattening everything into one list.
        st.markdown(
            "\n"
            "### Changelog\n"
            "- **Dec 2024**:\n"
            "  - Updated results for **Cross-MMLU**, **Cross-LogiQA**, **Cross-XQuad**, **MMLU**, **IndoMMLU**, and **SG-Eval-v2** with new prompts (simple prompts to encourage reasoning).\n"
            "  - Added new models: **SEA-LION v3**, **Gemma-2**, and **Sailor 2**.\n"
            "- **Nov 2024**:\n"
            "  - Updated layout and added support for comparison between models with similar sizes.\n"
        )

        st.divider()

        st.markdown("#### What is [SeaEval](https://seaeval.github.io/)?")

    with st.container():
        # Three equal columns; only the middle one is used, which centres
        # the image on the page.
        _, cent_co, _ = st.columns(3)
        with cent_co:
            st.image("./style/seaeval_overall.png", width=500)

        # Empty markdown element kept as a vertical spacer below the image.
        st.markdown("\n")

        st.markdown(
            "##### A benchmark for multilingual, multicultural foundation model "
            "evaluation consisting of >30 datasets, and we keep expanding over time."
        )
        st.markdown(
            ":star: How models understand and reason with natural language?\n"
            ":balloon: Languages: English, Chinese, Malay, Spanish, Indonesian, Vietnamese, Filipino.\n"
        )
        st.markdown(
            ":star: How models comprehend cultural practices, nuances and values?\n"
            ":balloon: 4 new datasets on Cultural Understanding.\n"
        )
        st.markdown(
            ":star: How models perform across languages in terms of consistency?\n"
            ":balloon: 2 new datasets with curated metrics for Cross-Lingual Consistency.\n"
        )

    with st.container():
        _, cent_co, _ = st.columns(3)
        with cent_co:
            st.image("./style/consistency.png", width=500)

        st.markdown("##### Evaluation with enhanced cross-lingual capabilities.")
        st.markdown(
            ":star: How models perform according to different (paraphrased) instructions?\n"
            ":balloon: Each dataset is equipped with 5 different prompts to avoid randomness introduced by instructions,\n"
            "which is non-negligible.\n"
        )
        st.markdown(
            ":star: Multilingual accuracy and performance consistency across languages.\n"
            ":balloon: If you can answer the question in your native language, can you answer the same question\n"
            "correctly in your second/third language?\n"
        )

    st.divider()

    with st.container():
        st.markdown("##### Citations")

        # Fenced code block so the BibTeX entry is shown verbatim.
        st.markdown(
            "\n"
            "```\n"
            "@article{SeaEval,\n"
            "title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},\n"
            "author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},\n"
            "journal={NAACL},\n"
            "year={2024}\n"
            "}\n"
            "```\n"
        )
def cross_lingual_consistency() -> None:
    """Render the Cross-Lingual Consistency task page.

    Shows select boxes for shot setting, dataset, model size and sort
    metric, maps the first two display labels to their internal
    identifiers, and passes everything to ``draw`` with
    ``'cross_lingual'`` as the first argument. Nothing is returned.
    """
    st.title("Task: Cross-Lingual Consistency")

    # Display label -> identifier handed to draw(). Insertion order is the
    # order in which the options appear in the select box.
    shot_ids = {
        'Zero Shot': 'zero_shot',
        'Few Shot': 'few_shot',
    }
    dataset_ids = {
        'Cross-MMLU': 'cross_mmlu_no_prompt',
        'Cross-XQUAD': 'cross_xquad_no_prompt',
        'Cross-LogiQA': 'cross_logiqa_no_prompt',
    }

    # Five equal columns; the fourth is left empty as a spacer.
    left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        shot_label = st.selectbox('Zero or Few Shot', list(shot_ids))

    with center:
        dataset_label = st.selectbox('Dataset', list(dataset_ids))

    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    with right:
        sort = st.selectbox(
            'Sort (For Chart)',
            ['Accuracy', 'Cross-Lingual Consistency', 'AC3',
             'English', 'Chinese', 'Spanish', 'Vietnamese'],
        )

    # The sort direction is fixed; there is no widget for it.
    sortby = 'Ascending'

    # The previous guard around this call tested `sortby`, a non-empty
    # constant, so it was always true and has been removed.
    draw('cross_lingual', shot_ids[shot_label], dataset_ids[dataset_label],
         sort, sortby, model_size_range)
def cultural_reasoning() -> None:
    """Render the Cultural Reasoning task page.

    Shows select boxes for shot setting, dataset and model size, maps the
    first two display labels to their internal identifiers, and passes
    them to ``draw`` with ``'cultural_reasoning'`` as the first argument
    and ``'Accuracy'`` as the sort metric. Nothing is returned.
    """
    st.title("Task: Cultural Reasoning")

    # Display label -> identifier handed to draw(). Insertion order is the
    # order in which the options appear in the select box.
    shot_ids = {
        'Zero Shot': 'zero_shot',
        'Few Shot': 'few_shot',
    }
    dataset_ids = {
        'SG-EVAL-v2-MCQ': 'sg_eval_v2_mcq_no_prompt',
        'SG-EVAL-v2-Open-Ended': 'sg_eval_v2_open',
        'SG-EVAL-v1-Cleaned': 'sg_eval_v1_cleaned',
        'SG-EVAL-v1': 'sg_eval',
        'CN-EVAL': 'cn_eval',
        'PH-EVAL': 'ph_eval',
        'US-EVAL': 'us_eval',
    }

    # Five equal columns; only the first three hold widgets on this page.
    left, center, middle, *_ = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        shot_label = st.selectbox('Zero or Few Shot', list(shot_ids))

    with center:
        dataset_label = st.selectbox('Dataset', list(dataset_ids))

    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    # The sort direction is fixed; there is no widget for it.
    sortby = 'Ascending'

    # The previous guard around this call tested `sortby`, a non-empty
    # constant, so it was always true and has been removed.
    draw('cultural_reasoning', shot_ids[shot_label], dataset_ids[dataset_label],
         'Accuracy', sortby, model_size_range)
def general_reasoning() -> None:
    """Render the General Reasoning task page.

    Shows select boxes for shot setting, dataset and model size, maps the
    first two display labels to their internal identifiers, and passes
    them to ``draw`` with ``'general_reasoning'`` as the first argument
    and ``'Accuracy'`` as the sort metric. Nothing is returned.
    """
    st.title("Task: General Reasoning")

    # Display label -> identifier handed to draw(). Insertion order is the
    # order in which the options appear in the select box.
    shot_ids = {
        'Zero Shot': 'zero_shot',
        'Few Shot': 'few_shot',
    }
    dataset_ids = {
        'IndoMMLU': 'indommlu_no_prompt',
        'MMLU': 'mmlu_no_prompt',
        'CMMLU': 'cmmlu',
        'C-Eval': 'c_eval',
        'ZBench': 'zbench',
    }

    # Five equal columns; only the first three hold widgets on this page.
    left, center, middle, *_ = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        shot_label = st.selectbox('Zero or Few Shot', list(shot_ids))

    with center:
        dataset_label = st.selectbox('Dataset', list(dataset_ids))

    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    # The sort direction is fixed; there is no widget for it.
    sortby = 'Ascending'

    # The previous guard around this call tested `sortby`, a non-empty
    # constant, so it was always true and has been removed.
    draw('general_reasoning', shot_ids[shot_label], dataset_ids[dataset_label],
         'Accuracy', sortby, model_size_range)
def flores() -> None:
    """Render the FLORES-Translation task page.

    Shows select boxes for shot setting, translation direction and model
    size, maps the first two display labels to their internal identifiers,
    and passes them to ``draw`` with ``'flores_translation'`` as the first
    argument and ``'BLEU'`` as the sort metric. Nothing is returned.
    """
    st.title("Task: FLORES-Translation")

    # Display label -> identifier handed to draw(). Insertion order is the
    # order in which the options appear in the select box.
    shot_ids = {
        'Zero Shot': 'zero_shot',
        'Few Shot': 'few_shot',
    }
    # The Vietnamese label was previously misspelled ("Vitenamese") in the
    # UI; only the display text changed, the identifier is the same.
    dataset_ids = {
        'Indonesian to English': 'ind2eng',
        'Vietnamese to English': 'vie2eng',
        'Chinese to English': 'zho2eng',
        'Malay to English': 'zsm2eng',
    }

    # Five equal columns; only the first three hold widgets on this page.
    left, center, middle, *_ = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        shot_label = st.selectbox('Zero or Few Shot', list(shot_ids))

    with center:
        dataset_label = st.selectbox('Dataset', list(dataset_ids))

    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    # The sort direction is fixed; there is no widget for it.
    sortby = 'Ascending'

    # The previous guard around this call tested `sortby`, a non-empty
    # constant, so it was always true and has been removed.
    draw('flores_translation', shot_ids[shot_label], dataset_ids[dataset_label],
         'BLEU', sortby, model_size_range)
def emotion() -> None:
    """Render the Emotion task page.

    Shows select boxes for shot setting, dataset and model size, maps the
    first two display labels to their internal identifiers, and passes
    them to ``draw`` with ``'emotion'`` as the first argument and
    ``'Accuracy'`` as the sort metric. Nothing is returned.
    """
    st.title("Task: Emotion")

    # Display label -> identifier handed to draw(). Insertion order is the
    # order in which the options appear in the select box.
    shot_ids = {
        'Zero Shot': 'zero_shot',
        'Few Shot': 'few_shot',
    }
    dataset_ids = {
        'Indonesian Emotion Classification': 'ind_emotion',
        'SST2': 'sst2',
    }

    # Five equal columns; only the first three hold widgets on this page.
    left, center, middle, *_ = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        shot_label = st.selectbox('Zero or Few Shot', list(shot_ids))

    with center:
        dataset_label = st.selectbox('Dataset', list(dataset_ids))

    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    # The sort direction is fixed; there is no widget for it.
    sortby = 'Ascending'

    # The previous guard around this call tested `sortby`, a non-empty
    # constant, so it was always true and has been removed.
    draw('emotion', shot_ids[shot_label], dataset_ids[dataset_label],
         'Accuracy', sortby, model_size_range)
def dialogue() -> None:
    """Render the Dialogue task page.

    Shows select boxes for shot setting, dataset, sort metric and model
    size. The sort options depend on the chosen dataset: ``'DREAM'``
    offers only ``'Accuracy'``, the other datasets offer the ROUGE
    variants and their average. The shot and dataset labels are mapped
    to their internal identifiers and everything is passed to ``draw``
    with ``'dialogue'`` as the first argument. Nothing is returned.
    """
    st.title("Task: Dialogue")

    # Display label -> identifier handed to draw(). Insertion order is the
    # order in which the options appear in the select box.
    shot_ids = {
        'Zero Shot': 'zero_shot',
        'Few Shot': 'few_shot',
    }
    dataset_ids = {
        'DREAM': 'dream',
        'SAMSum': 'samsum',
        'DialogSum': 'dialogsum',
    }

    # Five equal columns; the fourth is left empty as a spacer.
    left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        shot_label = st.selectbox('Zero or Few Shot', list(shot_ids))

    with center:
        dataset_label = st.selectbox('Dataset', list(dataset_ids))

    with right:
        # The comparison is on the display label, before it is mapped.
        if dataset_label == 'DREAM':
            sort = st.selectbox('Sort', ['Accuracy'])
        else:
            sort = st.selectbox('Sort', ['Average', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L'])

    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    # The sort direction is fixed; there is no widget for it.
    sortby = 'Ascending'

    # The previous guard around this call tested `sortby`, a non-empty
    # constant, so it was always true and has been removed.
    draw('dialogue', shot_ids[shot_label], dataset_ids[dataset_label],
         sort, sortby, model_size_range)
def fundamental_nlp_tasks() -> None:
    """Render the Fundamental NLP Tasks page.

    Shows select boxes for shot setting, dataset and model size, maps the
    first two display labels to their internal identifiers, and passes
    them to ``draw`` with ``'fundamental_nlp_tasks'`` as the first
    argument and ``'Accuracy'`` as the sort metric. Nothing is returned.
    """
    st.title("Task: Fundamental NLP Tasks")

    # Display label -> identifier handed to draw(). Insertion order is the
    # order in which the options appear in the select box.
    shot_ids = {
        'Zero Shot': 'zero_shot',
        'Few Shot': 'few_shot',
    }
    dataset_ids = {
        'OCNLI': 'ocnli',
        'C3': 'c3',
        'COLA': 'cola',
        'QQP': 'qqp',
        'MNLI': 'mnli',
        'QNLI': 'qnli',
        'WNLI': 'wnli',
        'RTE': 'rte',
        'MRPC': 'mrpc',
    }

    # Five equal columns; only the first three hold widgets on this page.
    left, center, middle, *_ = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        shot_label = st.selectbox('Zero or Few Shot', list(shot_ids))

    with center:
        dataset_label = st.selectbox('Dataset', list(dataset_ids))

    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    # The sort direction is fixed; there is no widget for it.
    sortby = 'Ascending'

    # The previous guard around this call tested `sortby`, a non-empty
    # constant, so it was always true and has been removed.
    draw('fundamental_nlp_tasks', shot_ids[shot_label], dataset_ids[dataset_label],
         'Accuracy', sortby, model_size_range)