import json
from statistics import median

import gradio as gr  # noqa: F401 -- used by the UI code later in this file
import pandas as pd

print("Loading datasets...")


# =============================================================================
# Generic table helpers
# =============================================================================

def add_rank(df, compute_average=True):
    """Sort ``df`` by score, prepend a 1-based "Rank" column, and round.

    Metadata columns ("Model", "Model Size (Params)", "Embedding Dimensions",
    "Sequence Length") are never treated as scores.

    Args:
        df: leaderboard table with one row per model.
        compute_average: when True and there is more than one score column,
            insert an "Average" column (row mean over score columns,
            NaN-propagating) and sort by it; otherwise sort by the first
            score column.

    Returns:
        The ranked DataFrame, rounded to 2 decimals, with NaN replaced by "".
    """
    meta_cols = ["Model", "Model Size (Params)", "Embedding Dimensions", "Sequence Length"]
    cols_to_rank = [col for col in df.columns if col not in meta_cols]
    if len(cols_to_rank) == 1:
        df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
    elif compute_average:
        # skipna=False: a model missing any score gets NaN as its average.
        df.insert(1, "Average", df[cols_to_rank].mean(axis=1, skipna=False))
        df.sort_values("Average", ascending=False, inplace=True)
    else:
        df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
    df.insert(0, "Rank", list(range(1, len(df) + 1)))
    df = df.round(2)
    # Fill NaN only after averaging so missing scores propagate correctly.
    df.fillna("", inplace=True)
    return df


def make_clickable_model(model_name, link=None):
    """Return the display name of a model (the part after the last "/").

    NOTE(review): ``link`` (default: the model's Hugging Face page) is
    currently unused -- the returned string contains no anchor markup.
    An HTML ``<a href=...>`` wrapper looks intended here; confirm before
    relying on the link being clickable.
    """
    if link is None:
        link = "https://huggingface.co/" + model_name
    # Drop the user/organization prefix from the model name.
    return (
        f'{model_name.split("/")[-1]}'
    )


# =============================================================================
# Raw results
# =============================================================================

with open('all_results.json', 'r') as f:
    ALL_RESULTS = json.load(f)

MODEL_LIST = list(ALL_RESULTS.keys())
NUM_MODELS = len(set(MODEL_LIST))
MODEL_TO_SIZE = {model: ALL_RESULTS[model]["model_size"] for model in MODEL_LIST}

# Language sets used by the per-language cross-lingual tables.
_XQUAD_LANGUAGES = ("English", "Vietnamese", "Chinese", "Spanish")
_SEA_LANGUAGES = ("English", "Vietnamese", "Chinese", "Indonesian",
                  "Filipino", "Spanish", "Malay")


def _model_header(model):
    """Common leading columns (size + clickable name) for one model row."""
    return {
        "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
        "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
    }


def _runs(model, eval_mode, dataset):
    """All per-run result dicts of ``dataset`` for ``model`` in ``eval_mode``."""
    return list(ALL_RESULTS[model][eval_mode][dataset].values())


def _finalize(df_list, compute_average, rank, fillna):
    """Assemble the DataFrame shared by every ``get_data_*`` function."""
    df = pd.DataFrame(df_list)
    # If the same model appears twice, merge the rows and keep the first
    # non-NaN value of each column.
    df = df.groupby("Model", as_index=False).first()
    # Put the 'Model' column first.
    cols = list(df.columns)
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]
    if rank:
        df = add_rank(df, compute_average=compute_average)
    if fillna:
        df.fillna("", inplace=True)
    return df


def _cross_overall(dataset, tag, eval_mode, fillna, rank):
    """Accuracy / cross-lingual consistency / AC3 table for one cross-lingual
    dataset, taking the median over runs; models with missing results are
    skipped with a message."""
    df_list = []
    for model in MODEL_LIST:
        try:
            runs = _runs(model, eval_mode, dataset)
            row = _model_header(model)
            row["Accuracy"] = median(r['overall_acc'] for r in runs)
            row["Cross-Lingual Consistency"] = median(r['consistency_score_3'] for r in runs)
            row["AC3"] = median(r['AC3_3'] for r in runs)
            df_list.append(row)
        except Exception:  # missing/partial results for this model: skip it
            print('Not found in model: {} for {}'.format(model, tag))
    return _finalize(df_list, compute_average=False, rank=rank, fillna=fillna)


def _cross_language(dataset, tag, languages, eval_mode, fillna, rank):
    """Per-language accuracy table (median over runs) for one cross-lingual
    dataset; a model missing any language is skipped entirely, matching the
    all-or-nothing behavior of the original per-dataset loaders."""
    df_list = []
    for model in MODEL_LIST:
        try:
            runs = _runs(model, eval_mode, dataset)
            row = _model_header(model)
            for language in languages:
                row[language] = median(r['language_acc'][language] for r in runs)
            df_list.append(row)
        except Exception:  # missing/partial results for this model: skip it
            print('Not found in model: {} for {}'.format(model, tag))
    return _finalize(df_list, compute_average=False, rank=rank, fillna=fillna)


def _single_metric(dataset, tag, eval_mode, fillna, rank,
                   metric='accuracy', column='Accuracy'):
    """Single-score-column table: the median of ``metric`` over runs,
    displayed under ``column``."""
    df_list = []
    for model in MODEL_LIST:
        try:
            runs = _runs(model, eval_mode, dataset)
            row = _model_header(model)
            row[column] = median(r[metric] for r in runs)
            df_list.append(row)
        except Exception:  # missing/partial results for this model: skip it
            print('Not found in model: {} for {}'.format(model, tag))
    return _finalize(df_list, compute_average=True, rank=rank, fillna=fillna)


# =============================================================================
# Cross-lingual datasets
# =============================================================================

def get_data_cross_xquad_overall(eval_mode='zero_shot', fillna=True, rank=True):
    """Cross-XQuAD overall accuracy/consistency/AC3 leaderboard."""
    return _cross_overall('cross_xquad', 'cross_xquad_overall', eval_mode, fillna, rank)


CROSS_XQUAD_ZERO_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="zero_shot")
CROSS_XQUAD_FIVE_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="five_shot")


def get_data_cross_xquad_language(eval_mode='zero_shot', fillna=True, rank=True):
    """Cross-XQuAD per-language accuracy leaderboard."""
    return _cross_language('cross_xquad', 'cross_xquad_lang', _XQUAD_LANGUAGES,
                           eval_mode, fillna, rank)


CROSS_XQUAD_ZERO_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="zero_shot")
CROSS_XQUAD_FIVE_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="five_shot")


def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
    """Cross-MMLU overall accuracy/consistency/AC3 leaderboard."""
    return _cross_overall('cross_mmlu', 'cross_mmlu_overall', eval_mode, fillna, rank)


CROSS_MMLU_ZERO_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="zero_shot")
CROSS_MMLU_FIVE_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="five_shot")


def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
    """Cross-MMLU per-language accuracy leaderboard."""
    return _cross_language('cross_mmlu', 'cross_mmlu_lang', _SEA_LANGUAGES,
                           eval_mode, fillna, rank)


CROSS_MMLU_ZERO_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="zero_shot")
CROSS_MMLU_FIVE_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="five_shot")


def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True):
    """Cross-LogiQA overall accuracy/consistency/AC3 leaderboard."""
    return _cross_overall('cross_logiqa', 'cross_logiqa_overall', eval_mode, fillna, rank)


CROSS_LOGIQA_ZERO_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="zero_shot")
CROSS_LOGIQA_FIVE_SHOT_OVERALL = get_data_cross_logiqa_overall(eval_mode="five_shot")


def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True):
    """Cross-LogiQA per-language accuracy leaderboard."""
    return _cross_language('cross_logiqa', 'cross_logiqa_language', _SEA_LANGUAGES,
                           eval_mode, fillna, rank)


CROSS_LOGIQA_ZERO_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="zero_shot")
CROSS_LOGIQA_FIVE_SHOT_LANGUAGE = get_data_cross_logiqa_language(eval_mode="five_shot")


# =============================================================================
# Regional / cultural evaluation datasets (single accuracy column)
# =============================================================================

def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """SG-Eval accuracy leaderboard."""
    return _single_metric('sg_eval', 'sg_eval', eval_mode, fillna, rank)


SG_EVAL_ZERO_SHOT = get_data_sg_eval(eval_mode="zero_shot")
SG_EVAL_FIVE_SHOT = get_data_sg_eval(eval_mode="five_shot")


def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """US-Eval accuracy leaderboard."""
    return _single_metric('us_eval', 'us_eval', eval_mode, fillna, rank)


US_EVAL_ZERO_SHOT = get_data_us_eval(eval_mode="zero_shot")
US_EVAL_FIVE_SHOT = get_data_us_eval(eval_mode="five_shot")


def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """CN-Eval accuracy leaderboard."""
    return _single_metric('cn_eval', 'cn_eval', eval_mode, fillna, rank)


CN_EVAL_ZERO_SHOT = get_data_cn_eval(eval_mode="zero_shot")
CN_EVAL_FIVE_SHOT = get_data_cn_eval(eval_mode="five_shot")


def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """PH-Eval accuracy leaderboard."""
    return _single_metric('ph_eval', 'ph_eval', eval_mode, fillna, rank)


PH_EVAL_ZERO_SHOT = get_data_ph_eval(eval_mode="zero_shot")
PH_EVAL_FIVE_SHOT = get_data_ph_eval(eval_mode="five_shot")


# =============================================================================
# Translation datasets (single BLEU column)
# =============================================================================

def get_data_sing2eng(eval_mode='zero_shot', fillna=True, rank=True):
    """Singlish-to-English translation BLEU leaderboard."""
    return _single_metric('sing2eng', 'sing2eng', eval_mode, fillna, rank,
                          metric='bleu_score', column='BLEU')


SING2ENG_ZERO_SHOT = get_data_sing2eng(eval_mode="zero_shot")
SING2ENG_FIVE_SHOT = get_data_sing2eng(eval_mode="five_shot")


def get_data_flores_ind2eng(eval_mode='zero_shot', fillna=True, rank=True):
    """FLORES Indonesian-to-English BLEU leaderboard."""
    return _single_metric('flores_ind2eng', 'flores_ind2eng', eval_mode, fillna, rank,
                          metric='bleu_score', column='BLEU')


FLORES_IND2ENG_ZERO_SHOT = get_data_flores_ind2eng(eval_mode="zero_shot")
FLORES_IND2ENG_FIVE_SHOT = get_data_flores_ind2eng(eval_mode="five_shot")


def get_data_flores_vie2eng(eval_mode='zero_shot', fillna=True, rank=True):
    """FLORES Vietnamese-to-English BLEU leaderboard."""
    return _single_metric('flores_vie2eng', 'flores_vie2eng', eval_mode, fillna, rank,
                          metric='bleu_score', column='BLEU')


FLORES_VIE2ENG_ZERO_SHOT = get_data_flores_vie2eng(eval_mode="zero_shot")
FLORES_VIE2ENG_FIVE_SHOT = get_data_flores_vie2eng(eval_mode="five_shot")


def get_data_flores_zho2eng(eval_mode='zero_shot', fillna=True, rank=True):
    """FLORES Chinese-to-English BLEU leaderboard."""
    return _single_metric('flores_zho2eng', 'flores_zho2eng', eval_mode, fillna, rank,
                          metric='bleu_score', column='BLEU')


FLORES_ZHO2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
FLORES_ZHO2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")


def get_data_flores_zsm2eng(eval_mode='zero_shot', fillna=True, rank=True):
    """FLORES Malay-to-English BLEU leaderboard."""
    return _single_metric('flores_zsm2eng', 'flores_zsm2eng', eval_mode, fillna, rank,
                          metric='bleu_score', column='BLEU')


# BUG FIX: these previously called get_data_flores_zho2eng (copy-paste error),
# so the Malay table silently duplicated the Chinese one.
FLORES_ZSM2ENG_ZERO_SHOT = get_data_flores_zsm2eng(eval_mode="zero_shot")
FLORES_ZSM2ENG_FIVE_SHOT = get_data_flores_zsm2eng(eval_mode="five_shot")


# =============================================================================
# General knowledge benchmarks (single accuracy column)
# =============================================================================

def get_data_mmlu(eval_mode='zero_shot', fillna=True, rank=True):
    """MMLU accuracy leaderboard.

    Previously this function silently assigned a dead local on failure;
    it now reports skipped models like every other loader.
    """
    return _single_metric('mmlu', 'mmlu', eval_mode, fillna, rank)


MMLU_ZERO_SHOT = get_data_mmlu(eval_mode="zero_shot")
MMLU_FIVE_SHOT = get_data_mmlu(eval_mode="five_shot")


def get_data_mmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
    """MMLU (full set) accuracy leaderboard."""
    return _single_metric('mmlu_full', 'mmlu_full', eval_mode, fillna, rank)


MMLU_FULL_ZERO_SHOT = get_data_mmlu_full(eval_mode="zero_shot")
MMLU_FULL_FIVE_SHOT = get_data_mmlu_full(eval_mode="five_shot")


def get_data_c_eval(eval_mode='zero_shot', fillna=True, rank=True):
    """C-Eval accuracy leaderboard."""
    return _single_metric('c_eval', 'c_eval', eval_mode, fillna, rank)


C_EVAL_ZERO_SHOT = get_data_c_eval(eval_mode="zero_shot")
C_EVAL_FIVE_SHOT = get_data_c_eval(eval_mode="five_shot")


def get_data_c_eval_full(eval_mode='zero_shot', fillna=True, rank=True):
    """C-Eval (full set) accuracy leaderboard."""
    return _single_metric('c_eval_full', 'c_eval_full', eval_mode, fillna, rank)


C_EVAL_FULL_ZERO_SHOT = get_data_c_eval_full(eval_mode="zero_shot")
C_EVAL_FULL_FIVE_SHOT = get_data_c_eval_full(eval_mode="five_shot")


def get_data_cmmlu(eval_mode='zero_shot', fillna=True, rank=True):
    """CMMLU accuracy leaderboard."""
    return _single_metric('cmmlu', 'cmmlu', eval_mode, fillna, rank)


CMMLU_ZERO_SHOT = get_data_cmmlu(eval_mode="zero_shot")
CMMLU_FIVE_SHOT = get_data_cmmlu(eval_mode="five_shot")


def get_data_cmmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
    """CMMLU (full set) accuracy leaderboard."""
    return _single_metric('cmmlu_full', 'cmmlu_full', eval_mode, fillna, rank)


CMMLU_FULL_ZERO_SHOT = get_data_cmmlu_full(eval_mode="zero_shot")
CMMLU_FULL_FIVE_SHOT = get_data_cmmlu_full(eval_mode="five_shot")


def get_data_zbench(eval_mode='zero_shot', fillna=True, rank=True):
    """ZBench accuracy leaderboard.

    NOTE(review): the tail of this function was truncated in the source
    paste; it is completed here following the identical template used by
    the other single-accuracy loaders -- confirm against the original.
    """
    return _single_metric('zbench', 'zbench', eval_mode, fillna, rank)
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df ZBENCH_ZERO_SHOT = get_data_zbench(eval_mode="zero_shot") ZBENCH_FIVE_SHOT = get_data_zbench(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_indommlu(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['indommlu'][res] for res in ALL_RESULTS[model][eval_mode]['indommlu']] accuracy = median([results['accuracy'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "indommlu")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df INDOMMLU_ZERO_SHOT = get_data_indommlu(eval_mode="zero_shot") INDOMMLU_FIVE_SHOT = get_data_indommlu(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_ind_emotion(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['ind_emotion'][res] for res in ALL_RESULTS[model][eval_mode]['ind_emotion']] accuracy = median([results['accuracy'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "ind_emotion")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df IND_EMOTION_ZERO_SHOT = get_data_ind_emotion(eval_mode="zero_shot") IND_EMOTION_FIVE_SHOT = get_data_ind_emotion(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_ocnli(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['ocnli'][res] for res in ALL_RESULTS[model][eval_mode]['ocnli']] accuracy = median([results['accuracy'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "ocnli")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df OCNLI_ZERO_SHOT = get_data_ocnli(eval_mode="zero_shot") OCNLI_FIVE_SHOT = get_data_ocnli(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_c3(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['c3'][res] for res in ALL_RESULTS[model][eval_mode]['c3']] accuracy = median([results['accuracy'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "c3")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df C3_ZERO_SHOT = get_data_c3(eval_mode="zero_shot") C3_FIVE_SHOT = get_data_c3(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_dream(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['dream'][res] for res in ALL_RESULTS[model][eval_mode]['dream']] accuracy = median([results['accuracy'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "dream")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df DREAM_ZERO_SHOT = get_data_dream(eval_mode="zero_shot") DREAM_FIVE_SHOT = get_data_dream(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_samsum(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['samsum'][res] for res in ALL_RESULTS[model][eval_mode]['samsum']] rouge1 = median([results['rouge1'] for results in results_list]) rouge2 = median([results['rouge2'] for results in results_list]) rougeL = median([results['rougeL'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "ROUGE-1": rouge1, "ROUGE-2": rouge2, "ROUGE-L": rougeL, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "samsum")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df SAMSUM_ZERO_SHOT = get_data_samsum(eval_mode="zero_shot") SAMSUM_FIVE_SHOT = get_data_samsum(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_dialogsum(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['dialogsum'][res] for res in ALL_RESULTS[model][eval_mode]['dialogsum']] rouge1 = median([results['rouge1'] for results in results_list]) rouge2 = median([results['rouge2'] for results in results_list]) rougeL = median([results['rougeL'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "ROUGE-1": rouge1, "ROUGE-2": rouge2, "ROUGE-L": rougeL, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "dialogsum")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df DIALOGSUM_ZERO_SHOT = get_data_dialogsum(eval_mode="zero_shot") DIALOGSUM_FIVE_SHOT = get_data_dialogsum(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_sst2(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['sst2'][res] for res in ALL_RESULTS[model][eval_mode]['sst2']] accuracy = median([results['accuracy'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "sst2")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df SST2_ZERO_SHOT = get_data_sst2(eval_mode="zero_shot") SST2_FIVE_SHOT = get_data_sst2(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_cola(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['cola'][res] for res in ALL_RESULTS[model][eval_mode]['cola']] accuracy = median([results['accuracy'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "cola")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df COLA_ZERO_SHOT = get_data_cola(eval_mode="zero_shot") COLA_FIVE_SHOT = get_data_cola(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_qqp(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['qqp'][res] for res in ALL_RESULTS[model][eval_mode]['qqp']] accuracy = median([results['accuracy'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "qqp")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df QQP_ZERO_SHOT = get_data_qqp(eval_mode="zero_shot") QQP_FIVE_SHOT = get_data_qqp(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_mnli(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['mnli'][res] for res in ALL_RESULTS[model][eval_mode]['mnli']] accuracy = median([results['accuracy'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "mnli")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df MNLI_ZERO_SHOT = get_data_mnli(eval_mode="zero_shot") MNLI_FIVE_SHOT = get_data_mnli(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_qnli(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['qnli'][res] for res in ALL_RESULTS[model][eval_mode]['qnli']] accuracy = median([results['accuracy'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "qnli")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df QNLI_ZERO_SHOT = get_data_qnli(eval_mode="zero_shot") QNLI_FIVE_SHOT = get_data_qnli(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_wnli(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['wnli'][res] for res in ALL_RESULTS[model][eval_mode]['wnli']] accuracy = median([results['accuracy'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "wnli")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df WNLI_ZERO_SHOT = get_data_wnli(eval_mode="zero_shot") WNLI_FIVE_SHOT = get_data_wnli(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_rte(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['rte'][res] for res in ALL_RESULTS[model][eval_mode]['rte']] accuracy = median([results['accuracy'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "rte")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df RTE_ZERO_SHOT = get_data_rte(eval_mode="zero_shot") RTE_FIVE_SHOT = get_data_rte(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = def get_data_mrpc(eval_mode='zero_shot', fillna=True, rank=True): df_list = [] for model in MODEL_LIST: try: results_list = [ALL_RESULTS[model][eval_mode]['mrpc'][res] for res in ALL_RESULTS[model][eval_mode]['mrpc']] accuracy = median([results['accuracy'] for results in results_list]) res = { "Model Size (Params)": MODEL_TO_SIZE.get(model, ""), "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]), "Accuracy": accuracy, } df_list.append(res) except: print('Not found in model: {} for {}'.format(model, "mrpc")) df = pd.DataFrame(df_list) # If there are any models that are the same, merge them # E.g. 
if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one df = df.groupby("Model", as_index=False).first() # Put 'Model' column first #cols = sorted(list(df.columns)) cols = list(df.columns) cols.insert(0, cols.pop(cols.index("Model"))) df = df[cols] if rank: df = add_rank(df, compute_average=True) if fillna: df.fillna("", inplace=True) return df MRPC_ZERO_SHOT = get_data_mrpc(eval_mode="zero_shot") MRPC_FIVE_SHOT = get_data_mrpc(eval_mode="five_shot") # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
= = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = 
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = # = = = = = = = = = = = = = 
# =============================================================================
# Gradio UI
# =============================================================================
# Every dataset tab follows the same layout (Zero Shot / Five Shot sub-tabs,
# optionally a per-language view, and a short footer), so the tab tree is
# described as data and rendered by two small helpers instead of dozens of
# copy-pasted blocks.

theme = gr.themes.Soft().set(
    background_fill_primary='*secondary_50'
)


def _leaderboard_table(df):
    """Render one leaderboard DataFrame inside a full-width row.

    Column typing matches the frames produced by add_rank(): a numeric
    "Rank" column, a markdown "Model" link column, then numeric scores.
    """
    with gr.Row():
        gr.components.Dataframe(
            df,
            datatype=["number", "markdown"] + ["number"] * len(df.columns),
            type="pandas",
        )


def _dataset_tab(title, footer_name, metric, languages,
                 zero_shot, five_shot, zero_shot_lang=None, five_shot_lang=None):
    """Build one dataset tab: Zero/Five Shot sub-tabs plus a footer note.

    The "Language Performance" sub-tabs are only created when a
    language-level DataFrame is supplied (the cross-lingual datasets).
    """
    with gr.TabItem(title):
        for shot_label, overall_df, lang_df in (
            ("Zero Shot", zero_shot, zero_shot_lang),
            ("Five Shot", five_shot, five_shot_lang),
        ):
            with gr.TabItem(shot_label):
                with gr.TabItem("Overall"):
                    _leaderboard_table(overall_df)
                if lang_df is not None:
                    with gr.TabItem("Language Performance"):
                        _leaderboard_table(lang_df)
        with gr.Row():
            gr.Markdown(f"""
**{footer_name} Leaderboard** 🔮
- **Metric:** {metric}
- **Languages:** {languages}
""")


_CROSS_METRIC = "Cross-Lingual Consistency, Accuracy, AC3"

# Category -> list of dataset specs, each spec being the positional arguments
# of _dataset_tab: (tab title, footer name, metric, languages, zero-shot df,
# five-shot df [, zero-shot language df, five-shot language df]).
_TAB_TREE = [
    ("Cross-Lingual Consistency", [
        ("Cross-MMLU", "Cross-MMLU", _CROSS_METRIC,
         "English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino",
         CROSS_MMLU_ZERO_SHOT_OVERALL, CROSS_MMLU_FIVE_SHOT_OVERALL,
         CROSS_MMLU_ZERO_SHOT_LANGUAGE, CROSS_MMLU_FIVE_SHOT_LANGUAGE),
        ("Cross-XQUAD", "Cross-XQUAD", _CROSS_METRIC,
         "English, Chinese, Spanish, Vietnamese",
         CROSS_XQUAD_ZERO_SHOT_OVERALL, CROSS_XQUAD_FIVE_SHOT_OVERALL,
         CROSS_XQUAD_ZERO_SHOT_LANGUAGE, CROSS_XQUAD_FIVE_SHOT_LANGUAGE),
        ("Cross-LogiQA", "Cross-LogiQA", _CROSS_METRIC,
         "English, Chinese, Malay, Indonesian, Spanish, Vietnamese, Filipino",
         CROSS_LOGIQA_ZERO_SHOT_OVERALL, CROSS_LOGIQA_FIVE_SHOT_OVERALL,
         CROSS_LOGIQA_ZERO_SHOT_LANGUAGE, CROSS_LOGIQA_FIVE_SHOT_LANGUAGE),
    ]),
    ("Cultural Reasoning", [
        ("SG_EVAL", "SG_EVAL", "Accuracy", "English",
         SG_EVAL_ZERO_SHOT, SG_EVAL_FIVE_SHOT),
        ("US_EVAL", "US_EVAL", "Accuracy", "English",
         US_EVAL_ZERO_SHOT, US_EVAL_FIVE_SHOT),
        ("CN_EVAL", "CN_EVAL", "Accuracy", "Chinese",
         CN_EVAL_ZERO_SHOT, CN_EVAL_FIVE_SHOT),
        ("PH_EVAL", "PH_EVAL", "Accuracy", "English",
         PH_EVAL_ZERO_SHOT, PH_EVAL_FIVE_SHOT),
        ("Singlish to English Translation", "SING2ENG", "BLEU Avg.", "English",
         SING2ENG_ZERO_SHOT, SING2ENG_FIVE_SHOT),
    ]),
    ("General Reasoning", [
        ("MMLU Subset", "MMLU", "Accuracy.", "English",
         MMLU_ZERO_SHOT, MMLU_FIVE_SHOT),
        ("MMLU Full", "MMLU Full", "Accuracy.", "English",
         MMLU_FULL_ZERO_SHOT, MMLU_FULL_FIVE_SHOT),
        ("C_EVAL Subset", "C_EVAL", "Accuracy.", "Chinese",
         C_EVAL_ZERO_SHOT, C_EVAL_FIVE_SHOT),
        ("C_EVAL Full", "C_EVAL Full", "Accuracy.", "Chinese",
         C_EVAL_FULL_ZERO_SHOT, C_EVAL_FULL_FIVE_SHOT),
        ("CMMLU Subset", "CMMLU", "Accuracy.", "Chinese",
         CMMLU_ZERO_SHOT, CMMLU_FIVE_SHOT),
        ("CMMLU Full", "CMMLU Full", "Accuracy.", "Chinese",
         CMMLU_FULL_ZERO_SHOT, CMMLU_FULL_FIVE_SHOT),
        ("ZBench", "ZBench", "Accuracy.", "Chinese",
         ZBENCH_ZERO_SHOT, ZBENCH_FIVE_SHOT),
        ("IndoMMLU", "IndoMMLU", "Accuracy.", "Bahasa Indonesian",
         INDOMMLU_ZERO_SHOT, INDOMMLU_FIVE_SHOT),
    ]),
    ("FLORES-Translation", [
        ("FLORES Indonesian to English Translation", "flores_ind2eng",
         "BLEU Avg.", "English",
         FLORES_IND2ENG_ZERO_SHOT, FLORES_IND2ENG_FIVE_SHOT),
        # Tab label fixed from original typo "Vitenamese".
        ("FLORES Vietnamese to English Translation", "flores_vie2eng",
         "BLEU Avg.", "English",
         FLORES_VIE2ENG_ZERO_SHOT, FLORES_VIE2ENG_FIVE_SHOT),
        ("FLORES Chinese to English Translation", "flores_zho2eng",
         "BLEU Avg.", "English",
         FLORES_ZHO2ENG_ZERO_SHOT, FLORES_ZHO2ENG_FIVE_SHOT),
        ("FLORES Malay to English Translation", "flores_zsm2eng",
         "BLEU Avg.", "English",
         FLORES_ZSM2ENG_ZERO_SHOT, FLORES_ZSM2ENG_FIVE_SHOT),
    ]),
    ("Emotion", [
        ("Indonesian Emotion Classification", "Ind_emotion", "Accuracy.",
         "Indonesian", IND_EMOTION_ZERO_SHOT, IND_EMOTION_FIVE_SHOT),
        ("SST2", "SST2", "Accuracy.", "English",
         SST2_ZERO_SHOT, SST2_FIVE_SHOT),
    ]),
    ("Dialogue", [
        ("DREAM", "DREAM", "Accuracy.", "English",
         DREAM_ZERO_SHOT, DREAM_FIVE_SHOT),
        ("SAMSum", "SAMSum", "ROUGE.", "English",
         SAMSUM_ZERO_SHOT, SAMSUM_FIVE_SHOT),
        ("DialogSum", "DialogSum", "ROUGE.", "English",
         DIALOGSUM_ZERO_SHOT, DIALOGSUM_FIVE_SHOT),
    ]),
    ("Fundamental NLP Tasks", [
        ("OCNLI", "OCNLI", "Accuracy.", "Chinese",
         OCNLI_ZERO_SHOT, OCNLI_FIVE_SHOT),
        ("C3", "C3", "Accuracy.", "Chinese",
         C3_ZERO_SHOT, C3_FIVE_SHOT),
        ("COLA", "COLA", "Accuracy.", "English",
         COLA_ZERO_SHOT, COLA_FIVE_SHOT),
        ("QQP", "QQP", "Accuracy.", "English",
         QQP_ZERO_SHOT, QQP_FIVE_SHOT),
        ("MNLI", "MNLI", "Accuracy.", "English",
         MNLI_ZERO_SHOT, MNLI_FIVE_SHOT),
        ("QNLI", "QNLI", "Accuracy.", "English",
         QNLI_ZERO_SHOT, QNLI_FIVE_SHOT),
        ("WNLI", "WNLI", "Accuracy.", "English",
         WNLI_ZERO_SHOT, WNLI_FIVE_SHOT),
        ("RTE", "RTE", "Accuracy.", "English",
         RTE_ZERO_SHOT, RTE_FIVE_SHOT),
        ("MRPC", "MRPC", "Accuracy.", "English",
         MRPC_ZERO_SHOT, MRPC_FIVE_SHOT),
    ]),
]

block = gr.Blocks(theme='rottenlittlecreature/Moon_Goblin')

with block:
    gr.Markdown(f"""
### SeaEval Leaderboard. To submit, refer to the SeaEval Website.
Refer to the [SeaEval paper](https://arxiv.org/abs/2309.04766) for details on metrics, tasks and models.
- **Number of Datasets**: > 30
- **Number of Languages**: > 8
- **Number of Models**: {NUM_MODELS}
- **Mode of Evaluation**: Zero-Shot, Five-Shot

### The following table shows the performance of the models on the SeaEval benchmark.
- For **Zero-Shot** performance, it is the median value from 5 distinct prompts shown on the above leaderboard to mitigate the influence of random variations induced by prompts.
- I am trying to evaluate the base models for five-shot performance and instruction-tuned models for zero-shot.

---
""")

    with gr.Tabs():
        for category_title, dataset_specs in _TAB_TREE:
            with gr.TabItem(category_title):
                for spec in dataset_specs:
                    _dataset_tab(*spec)

    gr.Markdown(r"""
### If our datasets and leaderboard are useful, please consider cite:
```bibtex
@article{SeaEval,
  title={SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment to Cultural Reasoning},
  author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},
  journal={NAACL},
  year={2024}}
```
""")

block.queue(max_size=10)
# block.launch(server_name="0.0.0.0", share=False)
block.launch(server_name="0.0.0.0", share=True)