import json import gradio as gr import pandas as pd from statistics import median print("Loading datasets...") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def add_rank(df, compute_average=True): cols_to_rank = [col for col in df.columns if col not in ["Model", "Model Size (Params)", "Embedding Dimensions", "Sequence Length"]] if len(cols_to_rank) == 1: df.sort_values(cols_to_rank[0], ascending=False, inplace=True) else: if compute_average: df.insert(1, "Average", df[cols_to_rank].mean(axis=1, skipna=False)) df.sort_values("Average", ascending=False, inplace=True) else: df.sort_values(cols_to_rank[0], ascending=False, inplace=True) df.insert(0, "Rank", list(range(1, len(df) + 1))) df = df.round(2) # Fill NaN after averaging df.fillna("", inplace=True) return df def make_clickable_model(model_name, link=None): if link is None: link = "https://huggingface.co/" + model_name # Remove user from model name return ( f'{model_name.split("/")[-1]}' ) # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = with open('all_results.json', 'r') as f: ALL_RESULTS = json.load(f) MODEL_LIST = list(ALL_RESULTS.keys()) NUM_MODELS = len(set(MODEL_LIST)) MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST} # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']] try: overall_acc = [results['overall_acc'] for results in results_list] overall_acc = median(overall_acc) consistency_score_3 = [results['consistency_score_3'] for results in results_list] consistency_score_3 = median(consistency_score_3) AC3_3 = [results['AC3_3'] for results in results_list] AC3_3 = median(AC3_3) except: print(results_list) consistency_score_3 = -1 overall_acc = -1 AC3_3 = -1 res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "AC3": AC3_3, "Cross-Lingual Consistency": consistency_score_3, "Accuracy": overall_acc, } df_list.append(res) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=False) if fillna: df.fillna("", inplace=True) return df CROSS_MMLU_ZERO_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="zero_shot") CROSS_MMLU_FIVE_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="five_shot") def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']] try: English = [results['language_acc']['English'] for results in results_list] Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list] Chinese = [results['language_acc']['Chinese'] for results in results_list] Indonesian = [results['language_acc']['Indonesian'] for results in results_list] Filipino = [results['language_acc']['Filipino'] for results in results_list] Spanish = [results['language_acc']['Spanish'] for results in results_list] Malay = [results['language_acc']['Malay'] for results in results_list] English = median(English) Vietnamese = median(Vietnamese) Chinese = median(Chinese) Indonesian = median(Indonesian) Filipino = median(Filipino) Spanish = median(Spanish) Malay = median(Malay) except: print(results_list) English = -1 Vietnamese = -1 Chinese = -1 Indonesian = -1 Filipino = -1 Spanish = -1 Malay = -1 res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "English": English, "Vietnamese": Vietnamese, "Chinese": Chinese, "Indonesian": Indonesian, "Filipino": Filipino, "Spanish": Spanish, "Malay": Malay, } df_list.append(res) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df CROSS_MMLU_ZERO_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="zero_shot") CROSS_MMLU_FIVE_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']] try: overall_acc = [results['overall_acc'] for results in results_list] overall_acc = median(overall_acc) consistency_score_3 = [results['consistency_score_3'] for results in results_list] consistency_score_3 = median(consistency_score_3) AC3_3 = [results['AC3_3'] for results in results_list] AC3_3 = median(AC3_3) except: print(results_list) consistency_score_3 = -1 overall_acc = -1 AC3_3 = -1 res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "AC3": AC3_3, "Cross-Lingual Consistency": consistency_score_3, "Accuracy": overall_acc, } df_list.append(res) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=False) if fillna: df.fillna("", inplace=True) return df CROSS_LOGIQA_ZERO_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="zero_shot") CROSS_LOGIQA_FIVE_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="five_shot") def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']] try: English = [results['language_acc']['English'] for results in results_list] Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list] Chinese = [results['language_acc']['Chinese'] for results in results_list] Indonesian = [results['language_acc']['Indonesian'] for results in results_list] Filipino = [results['language_acc']['Filipino'] for results in results_list] Spanish = [results['language_acc']['Spanish'] for results in results_list] Malay = [results['language_acc']['Malay'] for results in results_list] English = median(English) Vietnamese = median(Vietnamese) Chinese = median(Chinese) Indonesian = median(Indonesian) Filipino = median(Filipino) Spanish = median(Spanish) Malay = median(Malay) except: print(results_list) English = -1 Vietnamese = -1 Chinese = -1 Indonesian = -1 Filipino = -1 Spanish = -1 Malay = -1 res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "English": English, "Vietnamese": Vietnamese, "Chinese": Chinese, "Indonesian": Indonesian, "Filipino": Filipino, "Spanish": Spanish, "Malay": Malay, } df_list.append(res) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df CROSS_LOGIQA_ZERO_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="zero_shot") CROSS_LOGIQA_FIVE_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: results_list = [ALL_RESULTS[model][eval_mode]['sg_eval'][res] for res in ALL_RESULTS[model][eval_mode]['sg_eval']] try: accuracy = median([results['accuracy'] for results in results_list]) except: print(results_list) accuracy = -1 res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df SG_EVAL_ZERO_SHOT = get_data_sg_eval(eval_mode="zero_shot") SG_EVAL_FIVE_SHOT = get_data_sg_eval(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: results_list = [ALL_RESULTS[model][eval_mode]['us_eval'][res] for res in ALL_RESULTS[model][eval_mode]['us_eval']] try: accuracy = median([results['accuracy'] for results in results_list]) except: print(results_list) accuracy = -1 res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df US_EVAL_ZERO_SHOT = get_data_us_eval(eval_mode="zero_shot") US_EVAL_FIVE_SHOT = get_data_us_eval(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: results_list = [ALL_RESULTS[model][eval_mode]['cn_eval'][res] for res in ALL_RESULTS[model][eval_mode]['cn_eval']] try: accuracy = median([results['accuracy'] for results in results_list]) except: print(results_list) accuracy = -1 res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df CN_EVAL_ZERO_SHOT = get_data_cn_eval(eval_mode="zero_shot") CN_EVAL_FIVE_SHOT = get_data_cn_eval(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: results_list = [ALL_RESULTS[model][eval_mode]['ph_eval'][res] for res in ALL_RESULTS[model][eval_mode]['ph_eval']] try: accuracy = median([results['accuracy'] for results in results_list]) except: print(results_list) accuracy = -1 res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df PH_EVAL_ZERO_SHOT = get_data_ph_eval(eval_mode="zero_shot") PH_EVAL_FIVE_SHOT = get_data_ph_eval(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = block = gr.Blocks() with block: gr.Markdown(f""" SeaEval Leaderboard. To submit, refer to the SeaEval Website Refer to the [SeaEval paper](https://arxiv.org/abs/2309.04766) for details on metrics, tasks and models. - **Total Datasets**: 31 - **Total Languages**: 8 - **Total Models**: {NUM_MODELS} """) with gr.Tabs(): # dataset 1: cross-mmlu with gr.TabItem("Cross-MMLU"): with gr.Row(): gr.Markdown(""" **Cross-MMLU Leaderboard** 🔮 - **Metric:** Cross-Lingual Consistency, Accuracy, AC3 - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino """) with gr.TabItem("zero_shot"): with gr.TabItem("Overall"): with gr.Row(): cross_mmlu_zero_shot_overall = gr.components.Dataframe( CROSS_MMLU_ZERO_SHOT_OVERALL, datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_OVERALL.columns), type="pandas", ) with gr.TabItem("Language Performance"): with gr.Row(): cross_mmlu_zero_shot_overall = gr.components.Dataframe( CROSS_MMLU_ZERO_SHOT_LANGUAGE, datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_LANGUAGE.columns), type="pandas", ) with gr.TabItem("five_shot"): with gr.TabItem("Overall"): with gr.Row(): cross_mmlu_zero_shot_overall = gr.components.Dataframe( CROSS_MMLU_FIVE_SHOT_OVERALL, datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_OVERALL.columns), type="pandas", ) with gr.TabItem("Language Performance"): with gr.Row(): gr.components.Dataframe( CROSS_MMLU_FIVE_SHOT_LANGUAGE, datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_LANGUAGE.columns), type="pandas", ) # dataset 2: cross-logiqa with gr.TabItem("Cross-LogiQA"): with gr.Row(): gr.Markdown(""" **Cross-LogiQA Leaderboard** 🔮 - **Metric:** Cross-Lingual Consistency, Accuracy, AC3 - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino """) with gr.TabItem("zero_shot"): with gr.TabItem("Overall"): with gr.Row(): gr.components.Dataframe( CROSS_LOGIQA_ZERO_SHOT_OVERALL, datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_OVERALL.columns), type="pandas", ) with gr.TabItem("Language Performance"): with gr.Row(): gr.components.Dataframe( CROSS_LOGIQA_ZERO_SHOT_LANGUAGE, datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_LANGUAGE.columns), type="pandas", ) with gr.TabItem("five_shot"): with gr.TabItem("Overall"): with gr.Row(): gr.components.Dataframe( CROSS_LOGIQA_FIVE_SHOT_OVERALL, datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_OVERALL.columns), type="pandas", ) with gr.TabItem("Language Performance"): with gr.Row(): gr.components.Dataframe( CROSS_LOGIQA_FIVE_SHOT_LANGUAGE, datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_LANGUAGE.columns), type="pandas", ) # dataset 3: SG_EVAL with gr.TabItem("SG_EVAL"): with gr.Row(): gr.Markdown(""" **SG_EVAL Leaderboard** 🔮 - **Metric:** Accuracy - **Languages:** English """) with gr.TabItem("zero_shot"): with gr.TabItem("Overall"): with gr.Row(): gr.components.Dataframe( SG_EVAL_ZERO_SHOT, datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_ZERO_SHOT.columns), type="pandas", ) with gr.TabItem("five_shot"): with gr.TabItem("Overall"): with gr.Row(): gr.components.Dataframe( SG_EVAL_FIVE_SHOT, datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_FIVE_SHOT.columns), type="pandas", ) # dataset 4: with gr.TabItem("US_EVAL"): with gr.Row(): gr.Markdown(""" **US_EVAL Leaderboard** 🔮 - **Metric:** Accuracy - **Languages:** English """) with gr.TabItem("zero_shot"): with gr.TabItem("Overall"): with gr.Row(): gr.components.Dataframe( US_EVAL_ZERO_SHOT, datatype=["number", "markdown"] + ["number"] * len(US_EVAL_ZERO_SHOT.columns), type="pandas", ) with gr.TabItem("five_shot"): with gr.TabItem("Overall"): with gr.Row(): gr.components.Dataframe( US_EVAL_FIVE_SHOT, datatype=["number", "markdown"] + ["number"] * len(US_EVAL_FIVE_SHOT.columns), type="pandas", ) # dataset 5: with gr.TabItem("CN_EVAL"): with gr.Row(): gr.Markdown(""" **CN_EVAL Leaderboard** 🔮 - **Metric:** Accuracy - **Languages:** Chinese """) with gr.TabItem("zero_shot"): with gr.TabItem("Overall"): with gr.Row(): gr.components.Dataframe( CN_EVAL_ZERO_SHOT, datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_ZERO_SHOT.columns), type="pandas", ) with gr.TabItem("five_shot"): with gr.TabItem("Overall"): with gr.Row(): gr.components.Dataframe( CN_EVAL_FIVE_SHOT, datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_FIVE_SHOT.columns), type="pandas", ) # dataset 6: with gr.TabItem("PH_EVAL"): with gr.Row(): gr.Markdown(""" **PH_EVAL Leaderboard** 🔮 - **Metric:** Accuracy - **Languages:** English """) with gr.TabItem("zero_shot"): with gr.TabItem("Overall"): with gr.Row(): gr.components.Dataframe( PH_EVAL_ZERO_SHOT, datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_ZERO_SHOT.columns), type="pandas", ) with gr.TabItem("five_shot"): with gr.TabItem("Overall"): with gr.Row(): gr.components.Dataframe( PH_EVAL_FIVE_SHOT, datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_FIVE_SHOT.columns), type="pandas", ) gr.Markdown(r""" If this work is useful to you, please citing our work: ```bibtex @article{SeaEval2023, title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning}, author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.}, journal={arXiv preprint arXiv:2309.04766}, year={2023} } ``` """) # Running the functions on page load in addition to when the button is clicked # This is optional - If deactivated the data loaded at "Build time" is shown like for Overall tab """ block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining) """ block.queue(max_size=10) block.launch(server_name="0.0.0.0", share=True) # Possible changes: # Could add graphs / other visual content # Could add verification marks # Sources: # https://huggingface.co/spaces/gradio/leaderboard # https://huggingface.co/spaces/huggingface-projects/Deep-Reinforcement-Learning-Leaderboard # https://getemoji.com/