import json

import gradio as gr
import pandas as pd

from statistics import median


# Startup progress message: the leaderboard tables in this module are built at import time.
print("Loading datasets...")

# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =

def add_rank(df, compute_average=True):
    """Sort a leaderboard frame by score and prepend a 1-based ``Rank`` column.

    Every column other than the descriptive ones (``Model``,
    ``Model Size (Params)``, ``Embedding Dimensions``, ``Sequence Length``)
    is treated as a score column.  With several score columns and
    ``compute_average`` set, an ``Average`` column (row mean, NaN if any
    score is NaN) is inserted at position 1 and used as the sort key;
    otherwise the first score column is the sort key.

    Note that ``df`` itself is sorted and extended in place; the returned
    frame is a rounded (2 decimals) copy with NaN replaced by ``""``.
    """
    descriptive = ("Model", "Model Size (Params)", "Embedding Dimensions", "Sequence Length")
    score_cols = [name for name in df.columns if name not in descriptive]

    if len(score_cols) != 1 and compute_average:
        row_means = df[score_cols].mean(axis=1, skipna=False)
        df.insert(1, "Average", row_means)
        sort_key = "Average"
    else:
        sort_key = score_cols[0]

    # Best score first; ranks are then simply the row positions.
    df.sort_values(sort_key, ascending=False, inplace=True)
    df.insert(0, "Rank", list(range(1, len(df) + 1)))

    ranked = df.round(2)
    # Blank out NaN (e.g. an average over an incomplete row) for display.
    ranked.fillna("", inplace=True)
    return ranked

def make_clickable_model(model_name, link=None):
    """Return an HTML anchor for a model.

    The link text is the last ``/``-separated part of ``model_name`` (i.e.
    the name without its user/organisation prefix).  When ``link`` is not
    given, the target defaults to the model's page on huggingface.co.
    """
    href = link if link is not None else "https://huggingface.co/" + model_name
    short_name = model_name.split("/")[-1]
    return f'<a target="_blank" style="text-decoration: underline" href="{href}">{short_name}</a>'


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# Load the precomputed evaluation results.  The code below indexes this
# mapping by model name and reads "model_size" / "model_link" plus the
# per-evaluation-mode benchmark results from each entry.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's locale encoding.
with open('all_results.json', 'r', encoding='utf-8') as f:
    ALL_RESULTS = json.load(f)

MODEL_LIST = list(ALL_RESULTS.keys())
NUM_MODELS = len(set(MODEL_LIST))
MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =

def get_data_cross_xquad_overall(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``cross_xquad`` overall leaderboard table for one evaluation mode.

    For each model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['cross_xquad']`` are reduced to the median
    of ``overall_acc``, ``consistency_score_3`` and ``AC3_3``.  Models whose
    results are missing or malformed are reported on stdout and left out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=False)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``,
        ``Accuracy``, ``Cross-Lingual Consistency`` and ``AC3`` (preceded by
        ``Rank`` when ``rank`` is true).  The frame is empty, but keeps these
        columns, when no model has results.
    """
    columns = ["Model", "Model Size (Params)", "Accuracy", "Cross-Lingual Consistency", "AC3"]
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['cross_xquad'].values())
            rows.append({
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Accuracy": median(r['overall_acc'] for r in results_list),
                "Cross-Lingual Consistency": median(r['consistency_score_3'] for r in results_list),
                "AC3": median(r['AC3_3'] for r in results_list),
            })
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "cross_xquad_overall"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=False)

    if fillna:
        df.fillna("", inplace=True)

    return df

# 'cross_xquad' overall tables, built once at import time for the zero-shot and five-shot modes.
CROSS_XQUAD_ZERO_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="zero_shot")
CROSS_XQUAD_FIVE_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="five_shot")


def get_data_cross_xquad_language(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``cross_xquad`` per-language accuracy table for one evaluation mode.

    For each model in ``MODEL_LIST`` the ``language_acc`` values of the
    entries under ``ALL_RESULTS[model][eval_mode]['cross_xquad']`` are reduced
    to one median per language.  Models whose results are missing or
    malformed are reported on stdout and left out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=False)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``
        and one column per language (preceded by ``Rank`` when ``rank`` is
        true).  The frame is empty, but keeps these columns, when no model
        has results.
    """
    languages = ["English", "Vietnamese", "Chinese", "Spanish"]
    columns = ["Model", "Model Size (Params)"] + languages
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['cross_xquad'].values())
            row = {
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            }
            for language in languages:
                row[language] = median(r['language_acc'][language] for r in results_list)
            rows.append(row)
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "cross_xquad_lang"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=False)

    if fillna:
        df.fillna("", inplace=True)

    return df

# 'cross_xquad' per-language tables, built once at import time for the zero-shot and five-shot modes.
CROSS_XQUAD_ZERO_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="zero_shot")
CROSS_XQUAD_FIVE_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="five_shot")

# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =

def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``cross_mmlu`` overall leaderboard table for one evaluation mode.

    For each model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['cross_mmlu']`` are reduced to the median
    of ``overall_acc``, ``consistency_score_3`` and ``AC3_3``.  Models whose
    results are missing or malformed are reported on stdout and left out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=False)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``,
        ``Accuracy``, ``Cross-Lingual Consistency`` and ``AC3`` (preceded by
        ``Rank`` when ``rank`` is true).  The frame is empty, but keeps these
        columns, when no model has results.
    """
    columns = ["Model", "Model Size (Params)", "Accuracy", "Cross-Lingual Consistency", "AC3"]
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['cross_mmlu'].values())
            rows.append({
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Accuracy": median(r['overall_acc'] for r in results_list),
                "Cross-Lingual Consistency": median(r['consistency_score_3'] for r in results_list),
                "AC3": median(r['AC3_3'] for r in results_list),
            })
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "cross_mmlu_overall"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=False)

    if fillna:
        df.fillna("", inplace=True)

    return df

# 'cross_mmlu' overall tables, built once at import time for the zero-shot and five-shot modes.
CROSS_MMLU_ZERO_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="zero_shot")
CROSS_MMLU_FIVE_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="five_shot")


def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``cross_mmlu`` per-language accuracy table for one evaluation mode.

    For each model in ``MODEL_LIST`` the ``language_acc`` values of the
    entries under ``ALL_RESULTS[model][eval_mode]['cross_mmlu']`` are reduced
    to one median per language.  Models whose results are missing or
    malformed are reported on stdout and left out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=False)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``
        and one column per language (preceded by ``Rank`` when ``rank`` is
        true).  The frame is empty, but keeps these columns, when no model
        has results.
    """
    languages = ["English", "Vietnamese", "Chinese", "Indonesian", "Filipino", "Spanish", "Malay"]
    columns = ["Model", "Model Size (Params)"] + languages
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['cross_mmlu'].values())
            row = {
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            }
            for language in languages:
                row[language] = median(r['language_acc'][language] for r in results_list)
            rows.append(row)
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "cross_mmlu_lang"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=False)

    if fillna:
        df.fillna("", inplace=True)

    return df

# 'cross_mmlu' per-language tables, built once at import time for the zero-shot and five-shot modes.
CROSS_MMLU_ZERO_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="zero_shot")
CROSS_MMLU_FIVE_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="five_shot")

# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``cross_logiqa`` overall leaderboard table for one evaluation mode.

    For each model in ``MODEL_LIST`` the entries stored under
    ``ALL_RESULTS[model][eval_mode]['cross_logiqa']`` are reduced to the
    median of ``overall_acc``, ``consistency_score_3`` and ``AC3_3``.  Models
    whose results are missing or malformed are reported on stdout and left
    out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=False)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``,
        ``Accuracy``, ``Cross-Lingual Consistency`` and ``AC3`` (preceded by
        ``Rank`` when ``rank`` is true).  The frame is empty, but keeps these
        columns, when no model has results.
    """
    columns = ["Model", "Model Size (Params)", "Accuracy", "Cross-Lingual Consistency", "AC3"]
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['cross_logiqa'].values())
            rows.append({
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Accuracy": median(r['overall_acc'] for r in results_list),
                "Cross-Lingual Consistency": median(r['consistency_score_3'] for r in results_list),
                "AC3": median(r['AC3_3'] for r in results_list),
            })
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "cross_logiqa_overall"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=False)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'cross_logiqa' overall tables, built once at import time for the zero-shot and five-shot modes.
CROSS_LOGIQA_ZERO_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="zero_shot")
CROSS_LOGIQA_FIVE_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="five_shot")


def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``cross_logiqa`` per-language accuracy table for one evaluation mode.

    For each model in ``MODEL_LIST`` the ``language_acc`` values of the
    entries under ``ALL_RESULTS[model][eval_mode]['cross_logiqa']`` are
    reduced to one median per language.  Models whose results are missing or
    malformed are reported on stdout and left out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=False)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``
        and one column per language (preceded by ``Rank`` when ``rank`` is
        true).  The frame is empty, but keeps these columns, when no model
        has results.
    """
    languages = ["English", "Vietnamese", "Chinese", "Indonesian", "Filipino", "Spanish", "Malay"]
    columns = ["Model", "Model Size (Params)"] + languages
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['cross_logiqa'].values())
            row = {
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            }
            for language in languages:
                row[language] = median(r['language_acc'][language] for r in results_list)
            rows.append(row)
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "cross_logiqa_language"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=False)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'cross_logiqa' per-language tables, built once at import time for the zero-shot and five-shot modes.
CROSS_LOGIQA_ZERO_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="zero_shot")
CROSS_LOGIQA_FIVE_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="five_shot")

# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``sg_eval`` leaderboard table for one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of the entries
    under ``ALL_RESULTS[model][eval_mode]['sg_eval']`` are reduced to their
    median.  Models whose results are missing or malformed are reported on
    stdout and left out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``
        and ``Accuracy`` (preceded by ``Rank`` when ``rank`` is true).  The
        frame is empty, but keeps these columns, when no model has results.
    """
    columns = ["Model", "Model Size (Params)", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['sg_eval'].values())
            rows.append({
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Accuracy": median(r['accuracy'] for r in results_list),
            })
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "sg_eval"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'sg_eval' tables, built once at import time for the zero-shot and five-shot modes.
SG_EVAL_ZERO_SHOT = get_data_sg_eval(eval_mode="zero_shot")
SG_EVAL_FIVE_SHOT = get_data_sg_eval(eval_mode="five_shot")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``us_eval`` leaderboard table for one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of the entries
    under ``ALL_RESULTS[model][eval_mode]['us_eval']`` are reduced to their
    median.  Models whose results are missing or malformed are reported on
    stdout and left out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``
        and ``Accuracy`` (preceded by ``Rank`` when ``rank`` is true).  The
        frame is empty, but keeps these columns, when no model has results.
    """
    columns = ["Model", "Model Size (Params)", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['us_eval'].values())
            rows.append({
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Accuracy": median(r['accuracy'] for r in results_list),
            })
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "us_eval"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'us_eval' tables, built once at import time for the zero-shot and five-shot modes.
US_EVAL_ZERO_SHOT = get_data_us_eval(eval_mode="zero_shot")
US_EVAL_FIVE_SHOT = get_data_us_eval(eval_mode="five_shot")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``cn_eval`` leaderboard table for one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of the entries
    under ``ALL_RESULTS[model][eval_mode]['cn_eval']`` are reduced to their
    median.  Models whose results are missing or malformed are reported on
    stdout and left out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``
        and ``Accuracy`` (preceded by ``Rank`` when ``rank`` is true).  The
        frame is empty, but keeps these columns, when no model has results.
    """
    columns = ["Model", "Model Size (Params)", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['cn_eval'].values())
            rows.append({
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Accuracy": median(r['accuracy'] for r in results_list),
            })
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "cn_eval"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df

# 'cn_eval' tables, built once at import time for the zero-shot and five-shot modes.
CN_EVAL_ZERO_SHOT = get_data_cn_eval(eval_mode="zero_shot")
CN_EVAL_FIVE_SHOT = get_data_cn_eval(eval_mode="five_shot")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =

def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``ph_eval`` leaderboard table for one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of the entries
    under ``ALL_RESULTS[model][eval_mode]['ph_eval']`` are reduced to their
    median.  Models whose results are missing or malformed are reported on
    stdout and left out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``
        and ``Accuracy`` (preceded by ``Rank`` when ``rank`` is true).  The
        frame is empty, but keeps these columns, when no model has results.
    """
    columns = ["Model", "Model Size (Params)", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['ph_eval'].values())
            rows.append({
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Accuracy": median(r['accuracy'] for r in results_list),
            })
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "ph_eval"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'ph_eval' tables, built once at import time for the zero-shot and five-shot modes.
PH_EVAL_ZERO_SHOT = get_data_ph_eval(eval_mode="zero_shot")
PH_EVAL_FIVE_SHOT = get_data_ph_eval(eval_mode="five_shot")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_sing2eng(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``sing2eng`` leaderboard table for one evaluation mode.

    For each model in ``MODEL_LIST`` the ``bleu_score`` values of the entries
    under ``ALL_RESULTS[model][eval_mode]['sing2eng']`` are reduced to their
    median.  Models whose results are missing or malformed are reported on
    stdout and left out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``
        and ``BLEU`` (preceded by ``Rank`` when ``rank`` is true).  The frame
        is empty, but keeps these columns, when no model has results.
    """
    columns = ["Model", "Model Size (Params)", "BLEU"]
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['sing2eng'].values())
            rows.append({
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "BLEU": median(r['bleu_score'] for r in results_list),
            })
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "sing2eng"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'sing2eng' tables, built once at import time for the zero-shot and five-shot modes.
SING2ENG_ZERO_SHOT = get_data_sing2eng(eval_mode="zero_shot")
SING2ENG_FIVE_SHOT = get_data_sing2eng(eval_mode="five_shot")

# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_flores_ind2eng(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``flores_ind2eng`` leaderboard table for one evaluation mode.

    For each model in ``MODEL_LIST`` the ``bleu_score`` values of the entries
    under ``ALL_RESULTS[model][eval_mode]['flores_ind2eng']`` are reduced to
    their median.  Models whose results are missing or malformed are reported
    on stdout and left out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``
        and ``BLEU`` (preceded by ``Rank`` when ``rank`` is true).  The frame
        is empty, but keeps these columns, when no model has results.
    """
    columns = ["Model", "Model Size (Params)", "BLEU"]
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['flores_ind2eng'].values())
            rows.append({
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "BLEU": median(r['bleu_score'] for r in results_list),
            })
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "flores_ind2eng"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'flores_ind2eng' tables, built once at import time for the zero-shot and five-shot modes.
FLORES_IND2ENG_ZERO_SHOT = get_data_flores_ind2eng(eval_mode="zero_shot")
FLORES_IND2ENG_FIVE_SHOT = get_data_flores_ind2eng(eval_mode="five_shot")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_flores_vie2eng(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``flores_vie2eng`` leaderboard table for one evaluation mode.

    For each model in ``MODEL_LIST`` the ``bleu_score`` values of the entries
    under ``ALL_RESULTS[model][eval_mode]['flores_vie2eng']`` are reduced to
    their median.  Models whose results are missing or malformed are reported
    on stdout and left out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``
        and ``BLEU`` (preceded by ``Rank`` when ``rank`` is true).  The frame
        is empty, but keeps these columns, when no model has results.
    """
    columns = ["Model", "Model Size (Params)", "BLEU"]
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['flores_vie2eng'].values())
            rows.append({
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "BLEU": median(r['bleu_score'] for r in results_list),
            })
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "flores_vie2eng"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'flores_vie2eng' tables, built once at import time for the zero-shot and five-shot modes.
FLORES_VIE2ENG_ZERO_SHOT = get_data_flores_vie2eng(eval_mode="zero_shot")
FLORES_VIE2ENG_FIVE_SHOT = get_data_flores_vie2eng(eval_mode="five_shot")

# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_flores_zho2eng(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``flores_zho2eng`` leaderboard table for one evaluation mode.

    For each model in ``MODEL_LIST`` the ``bleu_score`` values of the entries
    under ``ALL_RESULTS[model][eval_mode]['flores_zho2eng']`` are reduced to
    their median.  Models whose results are missing or malformed are reported
    on stdout and left out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``
        and ``BLEU`` (preceded by ``Rank`` when ``rank`` is true).  The frame
        is empty, but keeps these columns, when no model has results.
    """
    columns = ["Model", "Model Size (Params)", "BLEU"]
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['flores_zho2eng'].values())
            rows.append({
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "BLEU": median(r['bleu_score'] for r in results_list),
            })
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "flores_zho2eng"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df

# 'flores_zho2eng' tables, built once at import time for the zero-shot and five-shot modes.
FLORES_ZHO2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
FLORES_ZHO2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_flores_zsm2eng(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the ``flores_zsm2eng`` leaderboard table for one evaluation mode.

    For each model in ``MODEL_LIST`` the ``bleu_score`` values of the entries
    under ``ALL_RESULTS[model][eval_mode]['flores_zsm2eng']`` are reduced to
    their median.  Models whose results are missing or malformed are reported
    on stdout and left out.

    Args:
        eval_mode: Evaluation-mode key to read, e.g. ``'zero_shot'``.
        fillna: When true, replace any remaining NaN cells with ``""``.
        rank: When true, run the table through
            ``add_rank(df, compute_average=True)``.

    Returns:
        A ``pandas.DataFrame`` with columns ``Model``, ``Model Size (Params)``
        and ``BLEU`` (preceded by ``Rank`` when ``rank`` is true).  The frame
        is empty, but keeps these columns, when no model has results.
    """
    columns = ["Model", "Model Size (Params)", "BLEU"]
    rows = []

    for model in MODEL_LIST:
        try:
            results_list = list(ALL_RESULTS[model][eval_mode]['flores_zsm2eng'].values())
            rows.append({
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "BLEU": median(r['bleu_score'] for r in results_list),
            })
        except (KeyError, TypeError, ValueError, AttributeError):
            # Expected data problems only: missing keys, malformed entries, or
            # an empty result set (median raises StatisticsError, a ValueError).
            print('Not found in model: {} for {}'.format(model, "flores_zsm2eng"))

    # Declaring the columns keeps the frame well-formed even with no rows.
    df = pd.DataFrame(rows, columns=columns)
    if not df.empty:
        # Rows that render to the same model link are merged, keeping the
        # first non-null value of every column.
        df = df.groupby("Model", as_index=False).first()
    # Keep 'Model' as the leading column.
    df = df[columns]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df

# 'flores_zsm2eng' tables, built once at import time for the zero-shot and
# five-shot modes.  They must come from get_data_flores_zsm2eng; calling the
# zho2eng builder here would just duplicate the 'flores_zho2eng' tables.
FLORES_ZSM2ENG_ZERO_SHOT = get_data_flores_zsm2eng(eval_mode="zero_shot")
FLORES_ZSM2ENG_FIVE_SHOT = get_data_flores_zsm2eng(eval_mode="five_shot")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_mmlu(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'mmlu' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['mmlu']`` are reduced to their
    median. Models whose results are missing or unusable are reported on
    stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['mmlu']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Report the skipped model the same way the other loaders in this
            # file do instead of dropping it silently. Missing keys, malformed
            # entries or an empty result set (median raises StatisticsError,
            # a ValueError) land here; other exceptions such as
            # KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "mmlu"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'mmlu' tables, built once per evaluation mode when the module is executed.
MMLU_ZERO_SHOT, MMLU_FIVE_SHOT = (
    get_data_mmlu(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
def get_data_mmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'mmlu_full' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['mmlu_full']`` are reduced to their
    median. Models whose results are missing or unusable are reported on
    stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['mmlu_full']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "mmlu_full"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df

# 'mmlu_full' tables, built once per evaluation mode when the module is executed.
MMLU_FULL_ZERO_SHOT, MMLU_FULL_FIVE_SHOT = (
    get_data_mmlu_full(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
def get_data_c_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'c_eval' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['c_eval']`` are reduced to their
    median. Models whose results are missing or unusable are reported on
    stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['c_eval']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "c_eval"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df

# 'c_eval' tables, built once per evaluation mode when the module is executed.
C_EVAL_ZERO_SHOT, C_EVAL_FIVE_SHOT = (
    get_data_c_eval(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_c_eval_full(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'c_eval_full' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['c_eval_full']`` are reduced to
    their median. Models whose results are missing or unusable are reported
    on stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['c_eval_full']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "c_eval_full"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'c_eval_full' tables, built once per evaluation mode when the module is executed.
C_EVAL_FULL_ZERO_SHOT, C_EVAL_FULL_FIVE_SHOT = (
    get_data_c_eval_full(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_cmmlu(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'cmmlu' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['cmmlu']`` are reduced to their
    median. Models whose results are missing or unusable are reported on
    stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['cmmlu']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "cmmlu"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'cmmlu' tables, built once per evaluation mode when the module is executed.
CMMLU_ZERO_SHOT, CMMLU_FIVE_SHOT = (
    get_data_cmmlu(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_cmmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'cmmlu_full' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['cmmlu_full']`` are reduced to
    their median. Models whose results are missing or unusable are reported
    on stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['cmmlu_full']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "cmmlu_full"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'cmmlu_full' tables, built once per evaluation mode when the module is executed.
CMMLU_FULL_ZERO_SHOT, CMMLU_FULL_FIVE_SHOT = (
    get_data_cmmlu_full(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_zbench(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'zbench' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['zbench']`` are reduced to their
    median. Models whose results are missing or unusable are reported on
    stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['zbench']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "zbench"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'zbench' tables, built once per evaluation mode when the module is executed.
ZBENCH_ZERO_SHOT, ZBENCH_FIVE_SHOT = (
    get_data_zbench(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)

# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_indommlu(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'indommlu' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['indommlu']`` are reduced to their
    median. Models whose results are missing or unusable are reported on
    stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['indommlu']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "indommlu"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'indommlu' tables, built once per evaluation mode when the module is executed.
INDOMMLU_ZERO_SHOT, INDOMMLU_FIVE_SHOT = (
    get_data_indommlu(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
def get_data_ind_emotion(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'ind_emotion' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['ind_emotion']`` are reduced to
    their median. Models whose results are missing or unusable are reported
    on stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['ind_emotion']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "ind_emotion"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df

# 'ind_emotion' tables, built once per evaluation mode when the module is executed.
IND_EMOTION_ZERO_SHOT, IND_EMOTION_FIVE_SHOT = (
    get_data_ind_emotion(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_ocnli(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'ocnli' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['ocnli']`` are reduced to their
    median. Models whose results are missing or unusable are reported on
    stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['ocnli']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "ocnli"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'ocnli' tables, built once per evaluation mode when the module is executed.
OCNLI_ZERO_SHOT, OCNLI_FIVE_SHOT = (
    get_data_ocnli(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_c3(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'c3' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['c3']`` are reduced to their
    median. Models whose results are missing or unusable are reported on
    stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['c3']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "c3"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'c3' tables, built once per evaluation mode when the module is executed.
C3_ZERO_SHOT, C3_FIVE_SHOT = (
    get_data_c3(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_dream(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'dream' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['dream']`` are reduced to their
    median. Models whose results are missing or unusable are reported on
    stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['dream']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "dream"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df

# 'dream' tables, built once per evaluation mode when the module is executed.
DREAM_ZERO_SHOT, DREAM_FIVE_SHOT = (
    get_data_dream(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)

# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
def get_data_samsum(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'samsum' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``rouge1``, ``rouge2`` and ``rougeL``
    values of all entries under ``ALL_RESULTS[model][eval_mode]['samsum']``
    are each reduced to their median. Models whose results are missing or
    unusable are reported on stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, pass the table to ``add_rank``, which adds an
            "Average" column over the three ROUGE columns, sorts by it and
            numbers the rows.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)",
        "ROUGE-1", "ROUGE-2" and "ROUGE-L" (plus the "Rank" and "Average"
        columns inserted by ``add_rank`` when ``rank`` is true), one row per
        distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "ROUGE-1", "ROUGE-2", "ROUGE-L"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['samsum']
            entries = [runs[run_id] for run_id in runs]

            rouge1 = median([entry['rouge1'] for entry in entries])
            rouge2 = median([entry['rouge2'] for entry in entries])
            rougeL = median([entry['rougeL'] for entry in entries])

            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "ROUGE-1": rouge1,
                "ROUGE-2": rouge2,
                "ROUGE-L": rougeL,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "samsum"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'samsum' tables, built once per evaluation mode when the module is executed.
SAMSUM_ZERO_SHOT, SAMSUM_FIVE_SHOT = (
    get_data_samsum(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_dialogsum(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'dialogsum' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``rouge1``, ``rouge2`` and ``rougeL``
    values of all entries under ``ALL_RESULTS[model][eval_mode]['dialogsum']``
    are each reduced to their median. Models whose results are missing or
    unusable are reported on stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, pass the table to ``add_rank``, which adds an
            "Average" column over the three ROUGE columns, sorts by it and
            numbers the rows.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)",
        "ROUGE-1", "ROUGE-2" and "ROUGE-L" (plus the "Rank" and "Average"
        columns inserted by ``add_rank`` when ``rank`` is true), one row per
        distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "ROUGE-1", "ROUGE-2", "ROUGE-L"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['dialogsum']
            entries = [runs[run_id] for run_id in runs]

            rouge1 = median([entry['rouge1'] for entry in entries])
            rouge2 = median([entry['rouge2'] for entry in entries])
            rougeL = median([entry['rougeL'] for entry in entries])

            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "ROUGE-1": rouge1,
                "ROUGE-2": rouge2,
                "ROUGE-L": rougeL,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "dialogsum"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'dialogsum' tables, built once per evaluation mode when the module is executed.
DIALOGSUM_ZERO_SHOT, DIALOGSUM_FIVE_SHOT = (
    get_data_dialogsum(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_sst2(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'sst2' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['sst2']`` are reduced to their
    median. Models whose results are missing or unusable are reported on
    stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['sst2']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "sst2"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'sst2' tables, built once per evaluation mode when the module is executed.
SST2_ZERO_SHOT, SST2_FIVE_SHOT = (
    get_data_sst2(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_cola(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'cola' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['cola']`` are reduced to their
    median. Models whose results are missing or unusable are reported on
    stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['cola']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "cola"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'cola' tables, built once per evaluation mode when the module is executed.
COLA_ZERO_SHOT, COLA_FIVE_SHOT = (
    get_data_cola(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_qqp(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'qqp' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['qqp']`` are reduced to their
    median. Models whose results are missing or unusable are reported on
    stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['qqp']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "qqp"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'qqp' tables, built once per evaluation mode when the module is executed.
QQP_ZERO_SHOT, QQP_FIVE_SHOT = (
    get_data_qqp(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_mnli(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the leaderboard table for the 'mnli' results of one evaluation mode.

    For each model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    under ``ALL_RESULTS[model][eval_mode]['mnli']`` are reduced to their
    median. Models whose results are missing or unusable are reported on
    stdout and left out of the table.

    Args:
        eval_mode: Key into ``ALL_RESULTS[model]`` selecting the result set.
        fillna: If true, replace NaN cells with "". ``add_rank`` already does
            this itself, so the flag only matters when ``rank`` is false.
        rank: If true, sort and number the rows with ``add_rank``.

    Returns:
        A DataFrame with the columns "Model", "Model Size (Params)" and
        "Accuracy" (plus the "Rank" column inserted by ``add_rank`` when
        ``rank`` is true), one row per distinct "Model" value.
    """
    table_columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            runs = ALL_RESULTS[model][eval_mode]['mnli']
            accuracy = median([runs[run_id]['accuracy'] for run_id in runs])
            rows.append({
                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
                "Accuracy": accuracy,
            })
        except (LookupError, TypeError, ValueError):
            # Missing keys, malformed entries or an empty result set (median
            # raises StatisticsError, a ValueError) skip the model; other
            # exceptions such as KeyboardInterrupt are not swallowed.
            print('Not found in model: {} for {}'.format(model, "mnli"))

    # Naming the columns keeps "Model" present even when every model was
    # skipped; without it the groupby below raises KeyError on an empty frame.
    df = pd.DataFrame(rows, columns=table_columns)
    # Collapse rows that share the same "Model" cell, keeping the first
    # non-null value of every other column.
    df = df.groupby("Model", as_index=False).first()
    # Put 'Model' column first
    ordered = ["Model"] + [col for col in df.columns if col != "Model"]
    df = df[ordered]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# 'mnli' tables, built once per evaluation mode when the module is executed.
MNLI_ZERO_SHOT, MNLI_FIVE_SHOT = (
    get_data_mnli(eval_mode=mode) for mode in ("zero_shot", "five_shot")
)


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_qnli(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the QNLI results table for one evaluation mode.

    For every model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    stored under ``ALL_RESULTS[model][eval_mode]['qnli']`` are reduced to
    their median. Models for which that lookup or computation fails are
    reported on stdout and left out of the table.

    Args:
        eval_mode: Key of the evaluation mode inside each model's results
            (this module calls it with "zero_shot" and "five_shot").
        fillna: When true, replace any remaining NaN cells with "".
        rank: When true, pass the table through ``add_rank``, which sorts it
            by accuracy (descending), prepends a "Rank" column, rounds to
            two decimals and fills NaN cells.

    Returns:
        pandas.DataFrame with one row per model and the columns "Model",
        "Model Size (Params)" and "Accuracy" (preceded by "Rank" when
        ``rank`` is true). When no model has results for this mode the frame
        is empty but still carries those columns.
    """
    # Declared up front so that an empty result set still yields a frame with
    # a "Model" column; otherwise the groupby below raises KeyError.
    columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            entries = ALL_RESULTS[model][eval_mode]['qnli'].values()
            accuracy = median([entry['accuracy'] for entry in entries])
            model_cell = make_clickable_model(model, link=ALL_RESULTS[model]["model_link"])
        except Exception:
            # Best effort: a model without usable results is skipped.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(f"Not found in model: {model} for qnli")
            continue

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": model_cell,
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows, columns=columns)
    # Models that render to the same "Model" cell are merged; for each column
    # the first non-null value wins.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    df = df[["Model"] + [col for col in df.columns if col != "Model"]]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# Precompute the QNLI tables once at import time, one per evaluation mode.
QNLI_ZERO_SHOT = get_data_qnli(eval_mode="zero_shot")
QNLI_FIVE_SHOT = get_data_qnli(eval_mode="five_shot")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_wnli(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the WNLI results table for one evaluation mode.

    For every model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    stored under ``ALL_RESULTS[model][eval_mode]['wnli']`` are reduced to
    their median. Models for which that lookup or computation fails are
    reported on stdout and left out of the table.

    Args:
        eval_mode: Key of the evaluation mode inside each model's results
            (this module calls it with "zero_shot" and "five_shot").
        fillna: When true, replace any remaining NaN cells with "".
        rank: When true, pass the table through ``add_rank``, which sorts it
            by accuracy (descending), prepends a "Rank" column, rounds to
            two decimals and fills NaN cells.

    Returns:
        pandas.DataFrame with one row per model and the columns "Model",
        "Model Size (Params)" and "Accuracy" (preceded by "Rank" when
        ``rank`` is true). When no model has results for this mode the frame
        is empty but still carries those columns.
    """
    # Declared up front so that an empty result set still yields a frame with
    # a "Model" column; otherwise the groupby below raises KeyError.
    columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            entries = ALL_RESULTS[model][eval_mode]['wnli'].values()
            accuracy = median([entry['accuracy'] for entry in entries])
            model_cell = make_clickable_model(model, link=ALL_RESULTS[model]["model_link"])
        except Exception:
            # Best effort: a model without usable results is skipped.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(f"Not found in model: {model} for wnli")
            continue

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": model_cell,
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows, columns=columns)
    # Models that render to the same "Model" cell are merged; for each column
    # the first non-null value wins.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    df = df[["Model"] + [col for col in df.columns if col != "Model"]]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df

# Precompute the WNLI tables once at import time, one per evaluation mode.
WNLI_ZERO_SHOT = get_data_wnli(eval_mode="zero_shot")
WNLI_FIVE_SHOT = get_data_wnli(eval_mode="five_shot")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


def get_data_rte(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the RTE results table for one evaluation mode.

    For every model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    stored under ``ALL_RESULTS[model][eval_mode]['rte']`` are reduced to
    their median. Models for which that lookup or computation fails are
    reported on stdout and left out of the table.

    Args:
        eval_mode: Key of the evaluation mode inside each model's results
            (this module calls it with "zero_shot" and "five_shot").
        fillna: When true, replace any remaining NaN cells with "".
        rank: When true, pass the table through ``add_rank``, which sorts it
            by accuracy (descending), prepends a "Rank" column, rounds to
            two decimals and fills NaN cells.

    Returns:
        pandas.DataFrame with one row per model and the columns "Model",
        "Model Size (Params)" and "Accuracy" (preceded by "Rank" when
        ``rank`` is true). When no model has results for this mode the frame
        is empty but still carries those columns.
    """
    # Declared up front so that an empty result set still yields a frame with
    # a "Model" column; otherwise the groupby below raises KeyError.
    columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            entries = ALL_RESULTS[model][eval_mode]['rte'].values()
            accuracy = median([entry['accuracy'] for entry in entries])
            model_cell = make_clickable_model(model, link=ALL_RESULTS[model]["model_link"])
        except Exception:
            # Best effort: a model without usable results is skipped.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(f"Not found in model: {model} for rte")
            continue

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": model_cell,
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows, columns=columns)
    # Models that render to the same "Model" cell are merged; for each column
    # the first non-null value wins.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    df = df[["Model"] + [col for col in df.columns if col != "Model"]]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# Precompute the RTE tables once at import time, one per evaluation mode.
RTE_ZERO_SHOT = get_data_rte(eval_mode="zero_shot")
RTE_FIVE_SHOT = get_data_rte(eval_mode="five_shot")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
def get_data_mrpc(eval_mode='zero_shot', fillna=True, rank=True):
    """Build the MRPC results table for one evaluation mode.

    For every model in ``MODEL_LIST`` the ``accuracy`` values of all entries
    stored under ``ALL_RESULTS[model][eval_mode]['mrpc']`` are reduced to
    their median. Models for which that lookup or computation fails are
    reported on stdout and left out of the table.

    Args:
        eval_mode: Key of the evaluation mode inside each model's results
            (this module calls it with "zero_shot" and "five_shot").
        fillna: When true, replace any remaining NaN cells with "".
        rank: When true, pass the table through ``add_rank``, which sorts it
            by accuracy (descending), prepends a "Rank" column, rounds to
            two decimals and fills NaN cells.

    Returns:
        pandas.DataFrame with one row per model and the columns "Model",
        "Model Size (Params)" and "Accuracy" (preceded by "Rank" when
        ``rank`` is true). When no model has results for this mode the frame
        is empty but still carries those columns.
    """
    # Declared up front so that an empty result set still yields a frame with
    # a "Model" column; otherwise the groupby below raises KeyError.
    columns = ["Model Size (Params)", "Model", "Accuracy"]
    rows = []

    for model in MODEL_LIST:
        try:
            entries = ALL_RESULTS[model][eval_mode]['mrpc'].values()
            accuracy = median([entry['accuracy'] for entry in entries])
            model_cell = make_clickable_model(model, link=ALL_RESULTS[model]["model_link"])
        except Exception:
            # Best effort: a model without usable results is skipped.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(f"Not found in model: {model} for mrpc")
            continue

        rows.append({
            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
            "Model": model_cell,
            "Accuracy": accuracy,
        })

    df = pd.DataFrame(rows, columns=columns)
    # Models that render to the same "Model" cell are merged; for each column
    # the first non-null value wins.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    df = df[["Model"] + [col for col in df.columns if col != "Model"]]

    if rank:
        df = add_rank(df, compute_average=True)

    if fillna:
        df.fillna("", inplace=True)

    return df


# Precompute the MRPC tables once at import time, one per evaluation mode.
MRPC_ZERO_SHOT = get_data_mrpc(eval_mode="zero_shot")
MRPC_FIVE_SHOT = get_data_mrpc(eval_mode="five_shot")


# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =


# Soft theme with the primary background fill overridden.
# NOTE(review): `theme` is not passed to the Blocks below, which is given a
# theme by name instead; confirm whether `theme` is still used elsewhere or
# is a leftover.
theme = gr.themes.Soft().set(
    background_fill_primary='*secondary_50'
)

# Top-level Gradio container that the UI layout below is built inside.
block = gr.Blocks(theme='rottenlittlecreature/Moon_Goblin')


with block:
    gr.Markdown(f"""
    ### SeaEval Leaderboard. To submit, refer to the <a href="https://seaeval.github.io/" target="_blank" style="text-decoration: underline">SeaEval Website</a>.  Refer to the [SeaEval paper](https://arxiv.org/abs/2309.04766) for details on metrics, tasks and models.
    - **Number of Datasets**: > 30 
    - **Number of Languages**: > 8
    - **Number of Models**: {NUM_MODELS}
    - **Mode of Evaluation**: Zero-Shot, Five-Shot

    ### The following table shows the performance of the models on the SeaEval benchmark.
    - For **Zero-Shot** performance, it is the median value from 5 distinct prompts shown on the above leaderboard to mitigate the influence of random variations induced by prompts.
    - I am trying to evaluate the base models for five-shot performance and instruction-tuned models for zero-shot.
    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    """)

    with gr.Tabs():
        with gr.TabItem("Cross-Lingual Consistency"):

            # dataset 1: cross-mmlu


            # dataset 1: cross-mmlu
            with gr.TabItem("Cross-MMLU"):
                with gr.TabItem("Zero Shot"):
                    with gr.TabItem("Overall"):
                        with gr.Row():
                            cross_mmlu_zero_shot_overall = gr.components.Dataframe(
                                CROSS_MMLU_ZERO_SHOT_OVERALL,
                                datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_OVERALL.columns),
                                type="pandas",
                            )
                    with gr.TabItem("Language Performance"):

                        with gr.Row():
                            cross_mmlu_zero_shot_overall = gr.components.Dataframe(
                                CROSS_MMLU_ZERO_SHOT_LANGUAGE,
                                datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_ZERO_SHOT_LANGUAGE.columns),
                                type="pandas",
                            )
                with gr.TabItem("Five Shot"):
                    with gr.TabItem("Overall"):

                        with gr.Row():
                            cross_mmlu_zero_shot_overall = gr.components.Dataframe(
                                CROSS_MMLU_FIVE_SHOT_OVERALL,
                                datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_OVERALL.columns),
                                type="pandas",
                            )
                    with gr.TabItem("Language Performance"):

                        with gr.Row():
                            gr.components.Dataframe(
                                CROSS_MMLU_FIVE_SHOT_LANGUAGE,
                                datatype=["number", "markdown"] + ["number"] * len(CROSS_MMLU_FIVE_SHOT_LANGUAGE.columns),
                                type="pandas",
                            )

                with gr.Row():
                    gr.Markdown("""
                    **Cross-MMLU Leaderboard** 🔮
                    - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
                    - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
                    """)


            with gr.TabItem("Cross-XQUAD"):
                with gr.TabItem("Zero Shot"):
                    with gr.TabItem("Overall"):
                        with gr.Row():
                            cross_xquad_zero_shot_overall = gr.components.Dataframe(
                                CROSS_XQUAD_ZERO_SHOT_OVERALL,
                                datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_OVERALL.columns),
                                type="pandas",
                            )
                    with gr.TabItem("Language Performance"):

                        with gr.Row():
                            cross_xquad_zero_shot_overall = gr.components.Dataframe(
                                CROSS_XQUAD_ZERO_SHOT_LANGUAGE,
                                datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_ZERO_SHOT_LANGUAGE.columns),
                                type="pandas",
                            )
                with gr.TabItem("Five Shot"):
                    with gr.TabItem("Overall"):

                        with gr.Row():
                            cross_xquad_zero_shot_overall = gr.components.Dataframe(
                                CROSS_XQUAD_FIVE_SHOT_OVERALL,
                                datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_OVERALL.columns),
                                type="pandas",
                            )
                    with gr.TabItem("Language Performance"):

                        with gr.Row():
                            gr.components.Dataframe(
                                CROSS_XQUAD_FIVE_SHOT_LANGUAGE,
                                datatype=["number", "markdown"] + ["number"] * len(CROSS_XQUAD_FIVE_SHOT_LANGUAGE.columns),
                                type="pandas",
                            )

                with gr.Row():
                    gr.Markdown("""
                    **Cross-XQUAD Leaderboard** 🔮
                    - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
                    - **Languages:** English, Chinese, Spanish, Vietnamese
                    """)


            # dataset 2: cross-logiqa
            with gr.TabItem("Cross-LogiQA"):
                with gr.TabItem("Zero Shot"):
                    with gr.TabItem("Overall"):
                        with gr.Row():
                            gr.components.Dataframe(
                                CROSS_LOGIQA_ZERO_SHOT_OVERALL,
                                datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_OVERALL.columns),
                                type="pandas",
                            )
                    with gr.TabItem("Language Performance"):

                        with gr.Row():
                            gr.components.Dataframe(
                                CROSS_LOGIQA_ZERO_SHOT_LANGUAGE,
                                datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_ZERO_SHOT_LANGUAGE.columns),
                                type="pandas",
                            )
                with gr.TabItem("Five Shot"):
                    with gr.TabItem("Overall"):
                        with gr.Row():
                            gr.components.Dataframe(
                                CROSS_LOGIQA_FIVE_SHOT_OVERALL,
                                datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_OVERALL.columns),
                                type="pandas",
                            )
                    with gr.TabItem("Language Performance"):
                        with gr.Row():
                            gr.components.Dataframe(
                                CROSS_LOGIQA_FIVE_SHOT_LANGUAGE,
                                datatype=["number", "markdown"] + ["number"] * len(CROSS_LOGIQA_FIVE_SHOT_LANGUAGE.columns),
                                type="pandas",
                            )
                with gr.Row():
                    gr.Markdown("""
                    **Cross-LogiQA Leaderboard** 🔮
                    - **Metric:** Cross-Lingual Consistency, Accuracy, AC3
                    - **Languages:** English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino
                    """)


        with gr.TabItem("Cultural Reasoning"):

            # dataset 3: SG_EVAL
            with gr.TabItem("SG_EVAL"):
                with gr.TabItem("Zero Shot"):
                    with gr.TabItem("Overall"):
                        with gr.Row():
                            gr.components.Dataframe(
                                SG_EVAL_ZERO_SHOT,
                                datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_ZERO_SHOT.columns),
                                type="pandas",
                            )
                with gr.TabItem("Five Shot"):
                    with gr.TabItem("Overall"):
                        with gr.Row():
                            gr.components.Dataframe(
                                SG_EVAL_FIVE_SHOT,
                                datatype=["number", "markdown"] + ["number"] * len(SG_EVAL_FIVE_SHOT.columns),
                                type="pandas",
                            )
                with gr.Row():
                    gr.Markdown("""
                    **SG_EVAL Leaderboard** 🔮
                    - **Metric:** Accuracy
                    - **Languages:** English
                    """)


            # dataset 4: 
            with gr.TabItem("US_EVAL"):
                with gr.TabItem("Zero Shot"):
                    with gr.TabItem("Overall"):
                        with gr.Row():
                            gr.components.Dataframe(
                                US_EVAL_ZERO_SHOT,
                                datatype=["number", "markdown"] + ["number"] * len(US_EVAL_ZERO_SHOT.columns),
                                type="pandas",
                            )
                with gr.TabItem("Five Shot"):
                    with gr.TabItem("Overall"):
                        with gr.Row():
                            gr.components.Dataframe(
                                US_EVAL_FIVE_SHOT,
                                datatype=["number", "markdown"] + ["number"] * len(US_EVAL_FIVE_SHOT.columns),
                                type="pandas",
                            )
                with gr.Row():
                    gr.Markdown("""
                    **US_EVAL Leaderboard** 🔮
                    - **Metric:** Accuracy
                    - **Languages:** English
                    """)


            # dataset 5: 
            with gr.TabItem("CN_EVAL"):
                with gr.TabItem("Zero Shot"):
                    with gr.TabItem("Overall"):
                        with gr.Row():
                            gr.components.Dataframe(
                                CN_EVAL_ZERO_SHOT,
                                datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_ZERO_SHOT.columns),
                                type="pandas",
                            )
                with gr.TabItem("Five Shot"):
                    with gr.TabItem("Overall"):
                        with gr.Row():
                            gr.components.Dataframe(
                                CN_EVAL_FIVE_SHOT,
                                datatype=["number", "markdown"] + ["number"] * len(CN_EVAL_FIVE_SHOT.columns),
                                type="pandas",
                            )
                with gr.Row():
                    gr.Markdown("""
                    **CN_EVAL Leaderboard** 🔮
                    - **Metric:** Accuracy
                    - **Languages:** Chinese
                    """)


            # dataset 6: 
            with gr.TabItem("PH_EVAL"):
                with gr.TabItem("Zero Shot"):
                    with gr.TabItem("Overall"):
                        with gr.Row():
                            gr.components.Dataframe(
                                PH_EVAL_ZERO_SHOT,
                                datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_ZERO_SHOT.columns),
                                type="pandas",
                            )
                with gr.TabItem("Five Shot"):
                    with gr.TabItem("Overall"):
                        with gr.Row():
                            gr.components.Dataframe(
                                PH_EVAL_FIVE_SHOT,
                                datatype=["number", "markdown"] + ["number"] * len(PH_EVAL_FIVE_SHOT.columns),
                                type="pandas",
                            )
                with gr.Row():
                    gr.Markdown("""
                    **PH_EVAL Leaderboard** 🔮
                    - **Metric:** Accuracy
                    - **Languages:** English
                    """)


            # dataset 7: 
            with gr.TabItem("Singlish to English Translation"):
                with gr.TabItem("Zero Shot"):
                    with gr.TabItem("Overall"):
                        with gr.Row():
                            gr.components.Dataframe(
                                SING2ENG_ZERO_SHOT,
                                datatype=["number", "markdown"] + ["number"] * len(SING2ENG_ZERO_SHOT.columns),
                                type="pandas",
                            )
                with gr.TabItem("Five Shot"):
                    with gr.TabItem("Overall"):
                        with gr.Row():
                            gr.components.Dataframe(
                                SING2ENG_FIVE_SHOT,
                                datatype=["number", "markdown"] + ["number"] * len(SING2ENG_FIVE_SHOT.columns),
                                type="pandas",
                            )
                with gr.Row():
                    gr.Markdown("""
                    **SING2ENG Leaderboard** 🔮
                    - **Metric:** BLEU Avg.
                    - **Languages:** English
                    """)


        with gr.TabItem("General Reasoning"):
            # One row per benchmark:
            # (tab title, zero-shot table, five-shot table, leaderboard name, language).
            general_reasoning_boards = (
                ("MMLU Subset", MMLU_ZERO_SHOT, MMLU_FIVE_SHOT, "MMLU", "English"),
                ("MMLU Full", MMLU_FULL_ZERO_SHOT, MMLU_FULL_FIVE_SHOT, "MMLU Full", "English"),
                ("C_EVAL Subset", C_EVAL_ZERO_SHOT, C_EVAL_FIVE_SHOT, "C_EVAL", "Chinese"),
                ("C_EVAL Full", C_EVAL_FULL_ZERO_SHOT, C_EVAL_FULL_FIVE_SHOT, "C_EVAL Full", "Chinese"),
                ("CMMLU Subset", CMMLU_ZERO_SHOT, CMMLU_FIVE_SHOT, "CMMLU", "Chinese"),
                ("CMMLU Full", CMMLU_FULL_ZERO_SHOT, CMMLU_FULL_FIVE_SHOT, "CMMLU Full", "Chinese"),
                ("ZBench", ZBENCH_ZERO_SHOT, ZBENCH_FIVE_SHOT, "ZBench", "Chinese"),
                ("IndoMMLU", INDOMMLU_ZERO_SHOT, INDOMMLU_FIVE_SHOT, "IndoMMLU", "Bahasa Indonesian"),
            )
            for tab_title, zero_shot_df, five_shot_df, board_name, board_language in general_reasoning_boards:
                with gr.TabItem(tab_title):
                    # Every benchmark exposes the same two sub-tabs, each wrapping one table.
                    for shot_title, shot_df in (("Zero Shot", zero_shot_df), ("Five Shot", five_shot_df)):
                        with gr.TabItem(shot_title):
                            with gr.TabItem("Overall"):
                                with gr.Row():
                                    # NOTE(review): the datatype list ends up two entries longer
                                    # than the column count; confirm Gradio ignores the surplus.
                                    gr.components.Dataframe(
                                        shot_df,
                                        datatype=["number", "markdown"] + ["number"] * len(shot_df.columns),
                                        type="pandas",
                                    )
                    with gr.Row():
                        # The text lines keep a 20-space indent so the string passed to
                        # gr.Markdown is the same as the hand-written per-benchmark version.
                        gr.Markdown(f"""
                    **{board_name} Leaderboard** 🔮
                    - **Metric:** Accuracy.
                    - **Languages:** {board_language}
                    """)


        with gr.TabItem("FLORES-Translation"):
            # One row per translation direction:
            # (tab title, zero-shot table, five-shot table, leaderboard name).
            flores_boards = (
                (
                    "FLORES Indonesian to English Translation",
                    FLORES_IND2ENG_ZERO_SHOT,
                    FLORES_IND2ENG_FIVE_SHOT,
                    "flores_ind2eng",
                ),
                (
                    # Title previously read "Vitenamese"; corrected spelling.
                    "FLORES Vietnamese to English Translation",
                    FLORES_VIE2ENG_ZERO_SHOT,
                    FLORES_VIE2ENG_FIVE_SHOT,
                    "flores_vie2eng",
                ),
                (
                    "FLORES Chinese to English Translation",
                    FLORES_ZHO2ENG_ZERO_SHOT,
                    FLORES_ZHO2ENG_FIVE_SHOT,
                    "flores_zho2eng",
                ),
                (
                    "FLORES Malay to English Translation",
                    FLORES_ZSM2ENG_ZERO_SHOT,
                    FLORES_ZSM2ENG_FIVE_SHOT,
                    "flores_zsm2eng",
                ),
            )
            for tab_title, zero_shot_df, five_shot_df, board_name in flores_boards:
                with gr.TabItem(tab_title):
                    # Every direction exposes the same two sub-tabs, each wrapping one table.
                    for shot_title, shot_df in (("Zero Shot", zero_shot_df), ("Five Shot", five_shot_df)):
                        with gr.TabItem(shot_title):
                            with gr.TabItem("Overall"):
                                with gr.Row():
                                    # NOTE(review): the datatype list ends up two entries longer
                                    # than the column count; confirm Gradio ignores the surplus.
                                    gr.components.Dataframe(
                                        shot_df,
                                        datatype=["number", "markdown"] + ["number"] * len(shot_df.columns),
                                        type="pandas",
                                    )
                    with gr.Row():
                        # The text lines keep a 20-space indent so the string passed to
                        # gr.Markdown is the same as the hand-written per-dataset version.
                        gr.Markdown(f"""
                    **{board_name} Leaderboard** 🔮
                    - **Metric:** BLEU Avg.
                    - **Languages:** English
                    """)
                    

        with gr.TabItem("Emotion"):
            # One row per dataset:
            # (tab title, zero-shot table, five-shot table, leaderboard name, language).
            emotion_boards = (
                (
                    "Indonesian Emotion Classification",
                    IND_EMOTION_ZERO_SHOT,
                    IND_EMOTION_FIVE_SHOT,
                    "Ind_emotion",
                    "Indonesian",
                ),
                ("SST2", SST2_ZERO_SHOT, SST2_FIVE_SHOT, "SST2", "English"),
            )
            for tab_title, zero_shot_df, five_shot_df, board_name, board_language in emotion_boards:
                with gr.TabItem(tab_title):
                    # Every dataset exposes the same two sub-tabs, each wrapping one table.
                    for shot_title, shot_df in (("Zero Shot", zero_shot_df), ("Five Shot", five_shot_df)):
                        with gr.TabItem(shot_title):
                            with gr.TabItem("Overall"):
                                with gr.Row():
                                    # NOTE(review): the datatype list ends up two entries longer
                                    # than the column count; confirm Gradio ignores the surplus.
                                    gr.components.Dataframe(
                                        shot_df,
                                        datatype=["number", "markdown"] + ["number"] * len(shot_df.columns),
                                        type="pandas",
                                    )
                    with gr.Row():
                        # The text lines keep a 20-space indent so the string passed to
                        # gr.Markdown is the same as the hand-written per-dataset version.
                        gr.Markdown(f"""
                    **{board_name} Leaderboard** 🔮
                    - **Metric:** Accuracy.
                    - **Languages:** {board_language}
                    """)


        with gr.TabItem("Dialogue"):
            # One row per dataset:
            # (tab title / leaderboard name, zero-shot table, five-shot table, metric).
            dialogue_boards = (
                ("DREAM", DREAM_ZERO_SHOT, DREAM_FIVE_SHOT, "Accuracy"),
                ("SAMSum", SAMSUM_ZERO_SHOT, SAMSUM_FIVE_SHOT, "ROUGE"),
                ("DialogSum", DIALOGSUM_ZERO_SHOT, DIALOGSUM_FIVE_SHOT, "ROUGE"),
            )
            for board_name, zero_shot_df, five_shot_df, board_metric in dialogue_boards:
                with gr.TabItem(board_name):
                    # Every dataset exposes the same two sub-tabs, each wrapping one table.
                    for shot_title, shot_df in (("Zero Shot", zero_shot_df), ("Five Shot", five_shot_df)):
                        with gr.TabItem(shot_title):
                            with gr.TabItem("Overall"):
                                with gr.Row():
                                    # NOTE(review): the datatype list ends up two entries longer
                                    # than the column count; confirm Gradio ignores the surplus.
                                    gr.components.Dataframe(
                                        shot_df,
                                        datatype=["number", "markdown"] + ["number"] * len(shot_df.columns),
                                        type="pandas",
                                    )
                    with gr.Row():
                        # The text lines keep a 20-space indent so the string passed to
                        # gr.Markdown is the same as the hand-written per-dataset version.
                        gr.Markdown(f"""
                    **{board_name} Leaderboard** 🔮
                    - **Metric:** {board_metric}.
                    - **Languages:** English
                    """)


        with gr.TabItem("Fundamental NLP Tasks"):
            # One row per dataset:
            # (tab title / leaderboard name, zero-shot table, five-shot table, language).
            fundamental_boards = (
                ("OCNLI", OCNLI_ZERO_SHOT, OCNLI_FIVE_SHOT, "Chinese"),
                ("C3", C3_ZERO_SHOT, C3_FIVE_SHOT, "Chinese"),
                ("COLA", COLA_ZERO_SHOT, COLA_FIVE_SHOT, "English"),
                ("QQP", QQP_ZERO_SHOT, QQP_FIVE_SHOT, "English"),
                ("MNLI", MNLI_ZERO_SHOT, MNLI_FIVE_SHOT, "English"),
                ("QNLI", QNLI_ZERO_SHOT, QNLI_FIVE_SHOT, "English"),
                ("WNLI", WNLI_ZERO_SHOT, WNLI_FIVE_SHOT, "English"),
                ("RTE", RTE_ZERO_SHOT, RTE_FIVE_SHOT, "English"),
                ("MRPC", MRPC_ZERO_SHOT, MRPC_FIVE_SHOT, "English"),
            )
            for board_name, zero_shot_df, five_shot_df, board_language in fundamental_boards:
                with gr.TabItem(board_name):
                    # Every dataset exposes the same two sub-tabs, each wrapping one table.
                    for shot_title, shot_df in (("Zero Shot", zero_shot_df), ("Five Shot", five_shot_df)):
                        with gr.TabItem(shot_title):
                            with gr.TabItem("Overall"):
                                with gr.Row():
                                    # NOTE(review): the datatype list ends up two entries longer
                                    # than the column count; confirm Gradio ignores the surplus.
                                    gr.components.Dataframe(
                                        shot_df,
                                        datatype=["number", "markdown"] + ["number"] * len(shot_df.columns),
                                        type="pandas",
                                    )
                    with gr.Row():
                        # The text lines keep a 20-space indent so the string passed to
                        # gr.Markdown is the same as the hand-written per-dataset version.
                        gr.Markdown(f"""
                    **{board_name} Leaderboard** 🔮
                    - **Metric:** Accuracy.
                    - **Languages:** {board_language}
                    """)

    # Citation request with the BibTeX entry for the SeaEval paper.
    # Heading grammar fixed: "please consider cite" -> "please consider citing".
    gr.Markdown(r"""
    ### If our datasets and leaderboard are useful, please consider citing:
    ```bibtex
        @article{SeaEval,
        title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
        author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
        journal={NAACL},
        year={2024}}
    ```
    """)


# Enable request queuing, holding at most 10 queued events at a time.
block.queue(max_size=10)

# Bind to every network interface and ask Gradio for a public share link.
# Set "share" to False to serve without the share link.
launch_options = {"server_name": "0.0.0.0", "share": True}
block.launch(**launch_options)