import streamlit as st
from app.draw_diagram import *


def dashboard():

    with st.container():
        st.title("SeaEval")

        st.markdown("""
                    [gh]: https://github.com/SeaEval/SeaEval
                    [![GitHub Repo stars](https://img.shields.io/github/stars/SeaEval/SeaEval?style=social)][gh]
                    """)

        st.markdown("""
                    ### Changelog

                    - **Dec 2024**:
                        - Updated results for **Cross-MMLU**, **Cross-LogiQA**, **Cross-XQuad**, **MMLU**, **IndoMMLU**, and **SG-Eval-v2** with new prompts (simple prompts to encourage reasoning).
                        - Added new models: **SEA-LION v3**, **Gemma-2**, and **Sailor 2**.
                    - **Nov 2024**:
                        - Updated layout and added support for comparison between models with similar sizes.
                    """)

        st.divider()

        st.markdown("#### What is [SeaEval](https://seaeval.github.io/)?")

        with st.container():
            left_co, cent_co, last_co = st.columns(3)
            with cent_co:
                st.image("./style/seaeval_overall.png",
                         # caption="SeaEval data range",
                         width=500)

        st.markdown('''
                    ''')

        st.markdown("##### A benchmark for multilingual, multicultural foundation model evaluation, consisting of >30 datasets and still expanding over time.")

        st.markdown('''
                    :star: How well do models understand and reason with natural language?

                    :balloon: Languages: English, Chinese, Malay, Spanish, Indonesian, Vietnamese, Filipino.
                    ''')

        st.markdown('''
                    :star: How well do models comprehend cultural practices, nuances, and values?

                    :balloon: 4 new datasets on Cultural Understanding.
                    ''')

        st.markdown('''
                    :star: How consistently do models perform across languages?

                    :balloon: 2 new datasets with curated metrics for Cross-Lingual Consistency.
                    ''')

        with st.container():
            left_co, cent_co, last_co = st.columns(3)
            with cent_co:
                st.image("./style/consistency.png",
                         # caption="SeaEval data range",
                         width=500)

        st.markdown("##### Evaluation with enhanced cross-lingual capabilities.")

        st.markdown('''
                    :star: How do models perform under different (paraphrased) instructions?

                    :balloon: Each dataset comes with 5 different prompts to mitigate the randomness introduced by instructions, which is non-negligible.
                    ''')

        st.markdown('''
                    :star: Multilingual accuracy and performance consistency across languages.

                    :balloon: If you can answer a question in your native language, can you answer the same question correctly in your second or third language?
                    ''')

        st.divider()

        with st.container():
            st.markdown("##### Citations")

            st.markdown('''
                        ```
                        @article{SeaEval,
                          title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
                          author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
                          journal={NAACL},
                          year={2024}
                        }
                        ```
                        ''')


def cross_lingual_consistency():
    st.title("Task: Cross-Lingual Consistency")

    filters_levelone = ['Zero Shot', 'Few Shot']
    filters_leveltwo = ['Cross-MMLU',
                        'Cross-XQUAD',
                        'Cross-LogiQA',
                        ]

    # Map the UI labels to the result keys expected by draw().
    category_one_dict = {'Zero Shot': 'zero_shot',
                         'Few Shot' : 'few_shot'}

    category_two_dict = {'Cross-MMLU'  : 'cross_mmlu_no_prompt',
                         'Cross-XQUAD' : 'cross_xquad_no_prompt',
                         'Cross-LogiQA': 'cross_logiqa_no_prompt',
                         }

    left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        category_one = st.selectbox('Zero or Few Shot', filters_levelone)
    with center:
        category_two = st.selectbox('Dataset', filters_leveltwo)
    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])
    with right:
        sort = st.selectbox('Sort (For Chart)', ['Accuracy', 'Cross-Lingual Consistency', 'AC3',
                                                 'English', 'Chinese', 'Spanish', 'Vietnamese'])

    sortby = 'Ascending'

    if category_one or category_two or sort or sortby:
        category_one = category_one_dict[category_one]
        category_two = category_two_dict[category_two]
        # draw() comes from app.draw_diagram; judging from the call sites in this
        # file it is called as draw(task_group, shot_setting, dataset_key, metric,
        # sort_order, model_size_range).
        draw('cross_lingual', category_one, category_two, sort, sortby, model_size_range)


def cultural_reasoning():
    st.title("Task: Cultural Reasoning")

    filters_levelone = ['Zero Shot', 'Few Shot']
    filters_leveltwo = ['SG-EVAL-v2-MCQ',
                        'SG-EVAL-v2-Open-Ended',
                        'SG-EVAL-v1-Cleaned',
                        'SG-EVAL-v1',
                        'CN-EVAL',
                        'PH-EVAL',
                        'US-EVAL'
                        ]

    category_one_dict = {'Zero Shot': 'zero_shot',
                         'Few Shot' : 'few_shot'}

    category_two_dict = {'SG-EVAL-v2-MCQ'       : 'sg_eval_v2_mcq_no_prompt',
                         'SG-EVAL-v1'           : 'sg_eval',
                         'SG-EVAL-v1-Cleaned'   : 'sg_eval_v1_cleaned',
                         'SG-EVAL-v2-Open-Ended': 'sg_eval_v2_open',
                         'US-EVAL'              : 'us_eval',
                         'CN-EVAL'              : 'cn_eval',
                         'PH-EVAL'              : 'ph_eval'
                         }

    left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        category_one = st.selectbox('Zero or Few Shot', filters_levelone)
    with center:
        category_two = st.selectbox('Dataset', filters_leveltwo)
    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    sortby = 'Ascending'

    if category_one or category_two or sortby:
        category_one = category_one_dict[category_one]
        category_two = category_two_dict[category_two]
        draw('cultural_reasoning', category_one, category_two, 'Accuracy', sortby, model_size_range)


def general_reasoning():
    st.title("Task: General Reasoning")

    filters_levelone = ['Zero Shot', 'Few Shot']
    filters_leveltwo = ['IndoMMLU',
                        'MMLU',
                        'CMMLU',
                        'C-Eval',
                        'ZBench',
                        ]

    category_one_dict = {'Zero Shot': 'zero_shot',
                         'Few Shot' : 'few_shot'}

    category_two_dict = {'IndoMMLU': 'indommlu_no_prompt',
                         'MMLU'    : 'mmlu_no_prompt',
                         'C-Eval'  : 'c_eval',
                         'CMMLU'   : 'cmmlu',
                         'ZBench'  : 'zbench',
                         }

    left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        category_one = st.selectbox('Zero or Few Shot', filters_levelone)
    with center:
        category_two = st.selectbox('Dataset', filters_leveltwo)
    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    sortby = 'Ascending'

    if category_one or category_two or sortby:
        category_one = category_one_dict[category_one]
        category_two = category_two_dict[category_two]
        draw('general_reasoning', category_one, category_two, 'Accuracy', sortby, model_size_range)


def flores():
    st.title("Task: FLORES-Translation")

    filters_levelone = ['Zero Shot', 'Few Shot']
    filters_leveltwo = ['Indonesian to English',
                        'Vietnamese to English',
                        'Chinese to English',
                        'Malay to English'
                        ]

    category_one_dict = {'Zero Shot': 'zero_shot',
                         'Few Shot' : 'few_shot'}

    category_two_dict = {'Indonesian to English': 'ind2eng',
                         'Vietnamese to English': 'vie2eng',
                         'Chinese to English'   : 'zho2eng',
                         'Malay to English'     : 'zsm2eng'
                         }

    left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        category_one = st.selectbox('Zero or Few Shot', filters_levelone)
    with center:
        category_two = st.selectbox('Dataset', filters_leveltwo)
    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    sortby = 'Ascending'

    if category_one or category_two or sortby:
        category_one = category_one_dict[category_one]
        category_two = category_two_dict[category_two]
        draw('flores_translation', category_one, category_two, 'BLEU', sortby, model_size_range)


def emotion():
    st.title("Task: Emotion")

    filters_levelone = ['Zero Shot', 'Few Shot']
    filters_leveltwo = ['Indonesian Emotion Classification',
                        'SST2',
                        ]

    category_one_dict = {'Zero Shot': 'zero_shot',
                         'Few Shot' : 'few_shot'}

    category_two_dict = {'Indonesian Emotion Classification': 'ind_emotion',
                         'SST2'                             : 'sst2'
                         }

    left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        category_one = st.selectbox('Zero or Few Shot', filters_levelone)
    with center:
        category_two = st.selectbox('Dataset', filters_leveltwo)
    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    sortby = 'Ascending'

    if category_one or category_two or sortby:
        category_one = category_one_dict[category_one]
        category_two = category_two_dict[category_two]
        draw('emotion', category_one, category_two, 'Accuracy', sortby, model_size_range)


def dialogue():
    st.title("Task: Dialogue")

    filters_levelone = ['Zero Shot', 'Few Shot']
    filters_leveltwo = ['DREAM',
                        'SAMSum',
                        'DialogSum',
                        ]

    category_one_dict = {'Zero Shot': 'zero_shot',
                         'Few Shot' : 'few_shot'}

    category_two_dict = {'DREAM'    : 'dream',
                         'SAMSum'   : 'samsum',
                         'DialogSum': 'dialogsum'
                         }

    left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        category_one = st.selectbox('Zero or Few Shot', filters_levelone)
    with center:
        category_two = st.selectbox('Dataset', filters_leveltwo)
    with right:
        # DREAM is multiple-choice (accuracy); the summarization sets report ROUGE.
        if category_two == 'DREAM':
            sort = st.selectbox('Sort', ['Accuracy'])
        else:
            sort = st.selectbox('Sort', ['Average', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L'])
    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    sortby = 'Ascending'

    if category_one or category_two or sort or sortby:
        category_one = category_one_dict[category_one]
        category_two = category_two_dict[category_two]
        draw('dialogue', category_one, category_two, sort, sortby, model_size_range)


def fundamental_nlp_tasks():
    st.title("Task: Fundamental NLP Tasks")

    filters_levelone = ['Zero Shot', 'Few Shot']
    filters_leveltwo = ['OCNLI', 'C3', 'COLA', 'QQP', 'MNLI', 'QNLI', 'WNLI', 'RTE', 'MRPC']

    category_one_dict = {'Zero Shot': 'zero_shot',
                         'Few Shot' : 'few_shot'}

    category_two_dict = {'OCNLI': 'ocnli',
                         'C3'   : 'c3',
                         'COLA' : 'cola',
                         'QQP'  : 'qqp',
                         'MNLI' : 'mnli',
                         'QNLI' : 'qnli',
                         'WNLI' : 'wnli',
                         'RTE'  : 'rte',
                         'MRPC' : 'mrpc'
                         }

    left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        category_one = st.selectbox('Zero or Few Shot', filters_levelone)
    with center:
        category_two = st.selectbox('Dataset', filters_leveltwo)
    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    sortby = 'Ascending'

    if category_one or category_two or sortby:
        category_one = category_one_dict[category_one]
        category_two = category_two_dict[category_two]
        draw('fundamental_nlp_tasks', category_one, category_two, 'Accuracy', sortby, model_size_range)
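

# ---------------------------------------------------------------------------
# Hedged sketch (assumption): the entry point that wires these page functions
# into the Streamlit app is not part of this file. The snippet below shows one
# plausible way to hook them up with a sidebar selector; the page labels and
# the __main__ guard are illustrative choices, not the project's actual
# navigation code. It only runs when this module itself is launched with
# `streamlit run`, so it stays inert when the module is imported elsewhere.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # Map human-readable page names to the functions that render them.
    _PAGES = {
        'Dashboard'                : dashboard,
        'Cross-Lingual Consistency': cross_lingual_consistency,
        'Cultural Reasoning'       : cultural_reasoning,
        'General Reasoning'        : general_reasoning,
        'FLORES-Translation'       : flores,
        'Emotion'                  : emotion,
        'Dialogue'                 : dialogue,
        'Fundamental NLP Tasks'    : fundamental_nlp_tasks,
    }

    _choice = st.sidebar.selectbox('Section', list(_PAGES.keys()))
    _PAGES[_choice]()  # render the selected page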