"""Streamlit pages for the SeaEval leaderboard.

Each public function renders one page: a title, a row of filter widgets and a
chart produced by one of the ``draw_*`` helpers.  Those helpers are not defined
in this file; they are presumably provided by the wildcard import from
``app.draw_diagram`` below.
"""
import streamlit as st

from app.draw_diagram import *

# Options of the "Zero / Few shot" selector and the value passed to the
# draw_* helpers for each displayed label.
_SHOT_LABELS = ['Zero Shot', 'Few Shot']
_SHOT_KEYS = {'Zero Shot': 'zero_shot', 'Few Shot': 'few_shot'}

# Options of every sort-direction selector.
_SORT_ORDERS = ['Ascending', 'Descending']


def _select_filters(categories):
    """Render the shared three-selector filter row and return the selections.

    The row has four columns (20% / 20% / 40% spacer / 20%) holding the
    shot selector, the sub-category selector and the sort-direction selector.

    Args:
        categories: Non-empty list of sub-category labels to offer.

    Returns:
        A ``(shot_key, category, order)`` tuple where ``shot_key`` is
        ``'zero_shot'`` or ``'few_shot'``, ``category`` is the chosen label
        exactly as displayed, and ``order`` is ``'Ascending'`` or
        ``'Descending'``.
    """
    left, center, _, right = st.columns([0.2, 0.2, 0.4, 0.2])

    with left:
        shot_label = st.selectbox('Select Zero / Few shot', _SHOT_LABELS)
    with center:
        category = st.selectbox('Select the sub-category', categories)
    with right:
        order = st.selectbox('sorted by', _SORT_ORDERS)

    # With a non-empty option list and the default index, st.selectbox always
    # returns one of the options, so no "nothing selected" fallback is needed.
    return _SHOT_KEYS[shot_label], category, order


def dashboard() -> None:
    """Render the landing page: badges, benchmark overview and citation."""
    # NOTE(review): line breaks and indentation inside the multi-line markdown
    # literals below were reconstructed -- confirm the rendered output.
    seaeval_url = "https://seaeval.github.io/"

    with st.container():
        st.title("SeaEval")

        st.markdown("""
            [gh]: https://github.com/SeaEval/SeaEval
            [![GitHub watchers](https://img.shields.io/github/watchers/SeaEval/SeaEval?style=social)][gh]
            [![GitHub Repo stars](https://img.shields.io/github/stars/SeaEval/SeaEval?style=social)][gh]
            """)

    st.divider()
    st.markdown(f"#### What is [SeaEval]({seaeval_url})")

    with st.container():
        # Three equal columns; only the middle one is used, to centre the image.
        _, center_col, _ = st.columns(3)
        with center_col:
            st.image("./style/seaeval_overall.png", width=500)

        st.markdown(''' ''')

        st.markdown("##### A new benchmark for multilingual foundation models consisting of 28 datasets.")

        st.markdown(''':star: How models understand and reason with natural language?
            :balloon: Languages: English, Chinese, Malay, Spanish, Indonesian, Vietnamese, Filipino.
            ''')

        st.markdown(''':star: How models comprehend cultural practices, nuances and values?
            :balloon: 4 new datasets on Cultural Understanding.
            ''')

        st.markdown(''':star: How models perform across languages in terms of consistency?
            :balloon: 2 new datasets with curated metrics for Cross-Lingual Consistency.
            ''')

    with st.container():
        _, center_col, _ = st.columns(3)
        with center_col:
            st.image("./style/consistency.png", width=500)

        st.markdown("##### Evaluation with enhanced cross-lingual capabilities.")

        st.markdown(''':star: How models perform according to different (paraphrased) instructions?
            :balloon: Each dataset is equipped with 5 different prompts to avoid randomness introduced by instructions, which is non-negligible.
            ''')

        st.markdown(''':star: Multilingual accuracy and performance consistency across languages.
            :balloon: If you can answer the question in your native language, can you answer the same question correctly in your second/third language?
            ''')

    st.divider()

    with st.container():
        st.markdown("##### Citations")

        st.markdown('''
            :round_pushpin: SeaEval Paper \n
                @article{SeaEval,
                  title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
                  author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
                  journal={NAACL},
                  year={2024}
                }
            ''')


def cross_lingual_consistency() -> None:
    """Render the Cross-Lingual Consistency page.

    Offers shot, dataset, sort-metric and sort-direction selectors and passes
    the selections to ``draw_cross_lingual``.
    """
    st.title("Cross-Lingual Consistency")

    # Displayed dataset label -> value passed to draw_cross_lingual.
    dataset_keys = {
        'Cross-MMLU': 'cross_mmlu',
        'Cross-XQUAD': 'cross_xquad',
        'Cross-LogiQA': 'cross_logiqa',
    }
    metrics = ['Accuracy', 'Cross-Lingual Consistency', 'AC3',
               'English', 'Chinese', 'Spanish', 'Vietnamese']

    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        shot_label = st.selectbox('Select Zero / Few shot', _SHOT_LABELS)
    with center:
        dataset_label = st.selectbox('Select the sub-category', list(dataset_keys))
    with middle:
        metric = st.selectbox('Sort', metrics)
    with right:
        order = st.selectbox('by', _SORT_ORDERS)

    # Every selectbox above always yields one of its options, so the
    # selections can be used directly.
    draw_cross_lingual(_SHOT_KEYS[shot_label], dataset_keys[dataset_label], metric, order)


def cultural_reasoning() -> None:
    """Render the Cultural Reasoning page via ``draw_only_acc``."""
    st.title("Cultural Reasoning")

    categories = ['SG EVAL', 'SG EVAL V1 Cleaned', 'SG EVAL V2 MCQ',
                  'SG EVAL V2 Open Ended', 'CN EVAL', 'PH EVAL', 'US EVAL']

    shot, category, order = _select_filters(categories)
    draw_only_acc('cultural_reasoning', shot, category, order)


def general_reasoning() -> None:
    """Render the General Reasoning page via ``draw_only_acc``."""
    st.title("General Reasoning")

    categories = ['MMLU', 'C Eval', 'CMMLU', 'ZBench', 'IndoMMLU']

    shot, category, order = _select_filters(categories)
    draw_only_acc('general_reasoning', shot, category, order)


def flores() -> None:
    """Render the FLORES translation page via ``draw_flores_translation``."""
    st.title("FLORES-Translation")

    # NOTE(review): 'Vitenamese' looks like a typo for 'Vietnamese', but the
    # label is passed unchanged to draw_flores_translation, which may look it
    # up verbatim -- fix both places together after checking that helper.
    categories = ['Indonesian to English', 'Vitenamese to English',
                  'Chinese to English', 'Malay to English']

    shot, category, order = _select_filters(categories)
    draw_flores_translation(shot, category, order)


def emotion() -> None:
    """Render the Emotion page via ``draw_only_acc``."""
    st.title("Emotion")

    categories = ['Indonesian Emotion Classification', 'SST2']

    shot, category, order = _select_filters(categories)
    draw_only_acc('emotion', shot, category, order)


def dialogue() -> None:
    """Render the Dialogue page.

    The sort-metric selector offers only ``'Accuracy'`` when the ``'DREAM'``
    dataset is chosen and the ROUGE-based metrics otherwise.  The selections
    are passed to ``draw_dialogue``.
    """
    st.title("Dialogue")

    categories = ['DREAM', 'SAMSum', 'DialogSum']

    left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])

    with left:
        shot_label = st.selectbox('Select Zero / Few shot', _SHOT_LABELS)
    with center:
        category = st.selectbox('Select the sub-category', categories)
    with middle:
        if category == 'DREAM':
            metrics = ['Accuracy']
        else:
            metrics = ['Average', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L']
        metric = st.selectbox('Sort', metrics)
    with right:
        order = st.selectbox('by', _SORT_ORDERS)

    # Every selectbox above always yields one of its options, so the
    # selections can be used directly.
    draw_dialogue(_SHOT_KEYS[shot_label], category, metric, order)


def fundamental_nlp_tasks() -> None:
    """Render the Fundamental NLP Tasks page via ``draw_only_acc``."""
    st.title("Fundamental NLP Tasks")

    categories = ['OCNLI', 'C3', 'COLA', 'QQP', 'MNLI', 'QNLI', 'WNLI', 'RTE', 'MRPC']

    shot, category, order = _select_filters(categories)
    draw_only_acc('fundamental_nlp_tasks', shot, category, order)