Spaces:

SeaEval
/

SeaEval_Leaderboard

Running

App Files Files Community

binwang committed on Apr 23

Commit

4687701

•

1 Parent(s): e90e78a

new format

Browse files

Files changed (1) hide show

app.py +394 -583

app.py CHANGED Viewed

@@ -55,12 +55,10 @@ def get_data_cross_xquad_overall(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['cross_xquad'][res] for res in ALL_RESULTS[model][eval_mode]['cross_xquad']]
         try:
             overall_acc = [results['overall_acc'] for results in results_list]
             overall_acc = median(overall_acc)
@@ -70,20 +68,18 @@ def get_data_cross_xquad_overall(eval_mode='zero_shot', fillna=True, rank=True):
             AC3_3 = [results['AC3_3'] for results in results_list]
             AC3_3 = median(AC3_3)
-        except:
-            consistency_score_3 = -1
-            overall_acc = -1
-            AC3_3 = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": overall_acc,
-            "Cross-Lingual Consistency": consistency_score_3,
-            "AC3": AC3_3,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -104,7 +100,6 @@ def get_data_cross_xquad_overall(eval_mode='zero_shot', fillna=True, rank=True):
     return df
 CROSS_XQUAD_ZERO_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="zero_shot")
 CROSS_XQUAD_FIVE_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="five_shot")
@@ -114,12 +109,10 @@ def get_data_cross_xquad_language(eval_mode='zero_shot', fillna=True, rank=True)
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['cross_xquad'][res] for res in ALL_RESULTS[model][eval_mode]['cross_xquad']]
         try:
             English    = [results['language_acc']['English'] for results in results_list]
             Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
             Chinese    = [results['language_acc']['Chinese'] for results in results_list]
@@ -130,23 +123,19 @@ def get_data_cross_xquad_language(eval_mode='zero_shot', fillna=True, rank=True)
             Chinese    = median(Chinese)
             Spanish    = median(Spanish)
-        except:
-            English = -1
-            Vietnamese = -1
-            Chinese = -1
-            Spanish = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "English": English,
-            "Vietnamese": Vietnamese,
-            "Chinese": Chinese,
-            "Spanish": Spanish,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -167,7 +156,6 @@ def get_data_cross_xquad_language(eval_mode='zero_shot', fillna=True, rank=True)
     return df
 CROSS_XQUAD_ZERO_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="zero_shot")
 CROSS_XQUAD_FIVE_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="five_shot")
@@ -186,12 +174,11 @@ def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']]
         try:
             overall_acc = [results['overall_acc'] for results in results_list]
             overall_acc = median(overall_acc)
@@ -201,20 +188,17 @@ def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
             AC3_3 = [results['AC3_3'] for results in results_list]
             AC3_3 = median(AC3_3)
-        except:
-            consistency_score_3 = -1
-            overall_acc = -1
-            AC3_3 = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": overall_acc,
-            "Cross-Lingual Consistency": consistency_score_3,
-            "AC3": AC3_3,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -235,7 +219,6 @@ def get_data_cross_mmlu_overall(eval_mode='zero_shot', fillna=True, rank=True):
     return df
 CROSS_MMLU_ZERO_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="zero_shot")
 CROSS_MMLU_FIVE_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="five_shot")
@@ -245,12 +228,11 @@ def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']]
-        try:
             English    = [results['language_acc']['English'] for results in results_list]
             Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
             Chinese    = [results['language_acc']['Chinese'] for results in results_list]
@@ -267,30 +249,22 @@ def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
             Spanish    = median(Spanish)
             Malay      = median(Malay)
-        except:
-            English = -1
-            Vietnamese = -1
-            Chinese = -1
-            Indonesian = -1
-            Filipino = -1
-            Spanish = -1
-            Malay = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "English": English,
-            "Vietnamese": Vietnamese,
-            "Chinese": Chinese,
-            "Indonesian": Indonesian,
-            "Filipino": Filipino,
-            "Spanish": Spanish,
-            "Malay": Malay,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -310,7 +284,6 @@ def get_data_cross_mmlu_language(eval_mode='zero_shot', fillna=True, rank=True):
     return df
 CROSS_MMLU_ZERO_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="zero_shot")
 CROSS_MMLU_FIVE_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="five_shot")
@@ -325,12 +298,11 @@ def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True)
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']]
-        try:
             overall_acc = [results['overall_acc'] for results in results_list]
             overall_acc = median(overall_acc)
@@ -340,20 +312,18 @@ def get_data_cross_logiqa_overall(eval_mode='zero_shot', fillna=True, rank=True)
             AC3_3 = [results['AC3_3'] for results in results_list]
             AC3_3 = median(AC3_3)
-        except:
-            consistency_score_3 = -1
-            overall_acc = -1
-            AC3_3 = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": overall_acc,
-            "Cross-Lingual Consistency": consistency_score_3,
-            "AC3": AC3_3,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -384,12 +354,11 @@ def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']]
-        try:
             English    = [results['language_acc']['English'] for results in results_list]
             Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
             Chinese    = [results['language_acc']['Chinese'] for results in results_list]
@@ -406,30 +375,24 @@ def get_data_cross_logiqa_language(eval_mode='zero_shot', fillna=True, rank=True
             Spanish    = median(Spanish)
             Malay      = median(Malay)
-        except:
-            English = -1
-            Vietnamese = -1
-            Chinese = -1
-            Indonesian = -1
-            Filipino = -1
-            Spanish = -1
-            Malay = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "English": English,
-            "Vietnamese": Vietnamese,
-            "Chinese": Chinese,
-            "Indonesian": Indonesian,
-            "Filipino": Filipino,
-            "Spanish": Spanish,
-            "Malay": Malay,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -462,24 +425,23 @@ def get_data_sg_eval(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['sg_eval'][res] for res in ALL_RESULTS[model][eval_mode]['sg_eval']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -515,24 +477,20 @@ def get_data_us_eval(eval_mode='zero_shot', fillna=True, rank=True):
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['us_eval'][res] for res in ALL_RESULTS[model][eval_mode]['us_eval']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -567,26 +525,21 @@ def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['cn_eval'][res] for res in ALL_RESULTS[model][eval_mode]['cn_eval']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -606,7 +559,6 @@ def get_data_cn_eval(eval_mode='zero_shot', fillna=True, rank=True):
     return df
 CN_EVAL_ZERO_SHOT = get_data_cn_eval(eval_mode="zero_shot")
 CN_EVAL_FIVE_SHOT = get_data_cn_eval(eval_mode="five_shot")
@@ -614,7 +566,6 @@ CN_EVAL_FIVE_SHOT = get_data_cn_eval(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
@@ -622,23 +573,21 @@ def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['ph_eval'][res] for res in ALL_RESULTS[model][eval_mode]['ph_eval']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -673,25 +622,21 @@ def get_data_sing2eng(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['sing2eng'][res] for res in ALL_RESULTS[model][eval_mode]['sing2eng']]
         try:
             bleu_score = median([results['bleu_score'] for results in results_list])
-        except:
-            bleu_score = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "BLEU": bleu_score,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -725,25 +670,21 @@ def get_data_flores_ind2eng(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['flores_ind2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_ind2eng']]
         try:
             bleu_score = median([results['bleu_score'] for results in results_list])
-        except:
-            bleu_score = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "BLEU": bleu_score,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -779,26 +720,21 @@ def get_data_flores_vie2eng(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['flores_vie2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_vie2eng']]
         try:
             bleu_score = median([results['bleu_score'] for results in results_list])
-        except:
-            bleu_score = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "BLEU": bleu_score,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -831,26 +767,21 @@ def get_data_flores_zho2eng(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['flores_zho2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_zho2eng']]
         try:
             bleu_score = median([results['bleu_score'] for results in results_list])
-        except:
-            bleu_score = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "BLEU": bleu_score,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -870,7 +801,6 @@ def get_data_flores_zho2eng(eval_mode='zero_shot', fillna=True, rank=True):
     return df
 FLORES_ZHO2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
 FLORES_ZHO2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")
@@ -884,26 +814,20 @@ def get_data_flores_zsm2eng(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['flores_zsm2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_zsm2eng']]
         try:
             bleu_score = median([results['bleu_score'] for results in results_list])
-        except:
-            bleu_score = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "BLEU": bleu_score,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -923,7 +847,6 @@ def get_data_flores_zsm2eng(eval_mode='zero_shot', fillna=True, rank=True):
     return df
 # The zsm2eng tables must be built by the zsm2eng loader (not the zho2eng one),
 # otherwise these constants just duplicate the FLORES_ZHO2ENG_* data.
 FLORES_ZSM2ENG_ZERO_SHOT = get_data_flores_zsm2eng(eval_mode="zero_shot")
 FLORES_ZSM2ENG_FIVE_SHOT = get_data_flores_zsm2eng(eval_mode="five_shot")
@@ -937,27 +860,21 @@ def get_data_mmlu(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['mmlu']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
         except:
             accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
@@ -984,32 +901,26 @@ MMLU_FIVE_SHOT = get_data_mmlu(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 def get_data_mmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['mmlu_full'][res] for res in ALL_RESULTS[model][eval_mode]['mmlu_full']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -1030,40 +941,31 @@ def get_data_mmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
     return df
 MMLU_FULL_ZERO_SHOT = get_data_mmlu_full(eval_mode="zero_shot")
 MMLU_FULL_FIVE_SHOT = get_data_mmlu_full(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 def get_data_c_eval(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
-    for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['c_eval'][res] for res in ALL_RESULTS[model][eval_mode]['c_eval']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -1083,7 +985,6 @@ def get_data_c_eval(eval_mode='zero_shot', fillna=True, rank=True):
     return df
 C_EVAL_ZERO_SHOT = get_data_c_eval(eval_mode="zero_shot")
 C_EVAL_FIVE_SHOT = get_data_c_eval(eval_mode="five_shot")
@@ -1097,25 +998,23 @@ def get_data_c_eval_full(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['c_eval_full'][res] for res in ALL_RESULTS[model][eval_mode]['c_eval_full']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -1152,25 +1051,24 @@ def get_data_cmmlu(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['cmmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cmmlu']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
         except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -1197,9 +1095,6 @@ CMMLU_FIVE_SHOT = get_data_cmmlu(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
@@ -1209,25 +1104,24 @@ def get_data_cmmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['cmmlu_full'][res] for res in ALL_RESULTS[model][eval_mode]['cmmlu_full']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
         except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -1263,25 +1157,20 @@ def get_data_zbench(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['zbench'][res] for res in ALL_RESULTS[model][eval_mode]['zbench']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -1316,21 +1205,23 @@ def get_data_indommlu(eval_mode='zero_shot', fillna=True, rank=True):
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['indommlu'][res] for res in ALL_RESULTS[model][eval_mode]['indommlu']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
         except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -1358,33 +1249,25 @@ INDOMMLU_FIVE_SHOT = get_data_indommlu(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 def get_data_ind_emotion(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['ind_emotion'][res] for res in ALL_RESULTS[model][eval_mode]['ind_emotion']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -1404,7 +1287,6 @@ def get_data_ind_emotion(eval_mode='zero_shot', fillna=True, rank=True):
     return df
 IND_EMOTION_ZERO_SHOT = get_data_ind_emotion(eval_mode="zero_shot")
 IND_EMOTION_FIVE_SHOT = get_data_ind_emotion(eval_mode="five_shot")
@@ -1420,25 +1302,21 @@ def get_data_ocnli(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['ocnli'][res] for res in ALL_RESULTS[model][eval_mode]['ocnli']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -1474,26 +1352,21 @@ def get_data_c3(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['c3'][res] for res in ALL_RESULTS[model][eval_mode]['c3']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -1528,25 +1401,21 @@ def get_data_dream(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['dream'][res] for res in ALL_RESULTS[model][eval_mode]['dream']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -1567,47 +1436,36 @@ def get_data_dream(eval_mode='zero_shot', fillna=True, rank=True):
     return df
 DREAM_ZERO_SHOT = get_data_dream(eval_mode="zero_shot")
 DREAM_FIVE_SHOT = get_data_dream(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 def get_data_samsum(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['samsum'][res] for res in ALL_RESULTS[model][eval_mode]['samsum']]
         try:
             rouge1 = median([results['rouge1'] for results in results_list])
             rouge2 = median([results['rouge2'] for results in results_list])
             rougeL = median([results['rougeL'] for results in results_list])
-        except:
-            rouge1 = -1
-            rouge2 = -1
-            rougeL = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "ROUGE-1": rouge1,
-            "ROUGE-2": rouge2,
-            "ROUGE-L": rougeL,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -1641,31 +1499,29 @@ def get_data_dialogsum(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['dialogsum'][res] for res in ALL_RESULTS[model][eval_mode]['dialogsum']]
         try:
             rouge1 = median([results['rouge1'] for results in results_list])
             rouge2 = median([results['rouge2'] for results in results_list])
             rougeL = median([results['rougeL'] for results in results_list])
         except:
-            rouge1 = -1
-            rouge2 = -1
-            rougeL = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "ROUGE-1": rouge1,
-            "ROUGE-2": rouge2,
-            "ROUGE-L": rougeL,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -1703,24 +1559,23 @@ def get_data_sst2(eval_mode='zero_shot', fillna=True, rank=True):
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['sst2'][res] for res in ALL_RESULTS[model][eval_mode]['sst2']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
         except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -1757,26 +1612,21 @@ def get_data_cola(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['cola'][res] for res in ALL_RESULTS[model][eval_mode]['cola']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -1814,24 +1664,20 @@ def get_data_qqp(eval_mode='zero_shot', fillna=True, rank=True):
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['qqp'][res] for res in ALL_RESULTS[model][eval_mode]['qqp']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -1869,25 +1715,21 @@ def get_data_mnli(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['mnli'][res] for res in ALL_RESULTS[model][eval_mode]['mnli']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
@@ -1925,26 +1767,21 @@ def get_data_qnli(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['qnli'][res] for res in ALL_RESULTS[model][eval_mode]['qnli']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -1981,26 +1818,21 @@ def get_data_wnli(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['wnli'][res] for res in ALL_RESULTS[model][eval_mode]['wnli']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -2020,14 +1852,10 @@ def get_data_wnli(eval_mode='zero_shot', fillna=True, rank=True):
     return df
 # Module-level WNLI results, computed by get_data_wnli once per evaluation mode.
 WNLI_ZERO_SHOT = get_data_wnli(eval_mode="zero_shot")
 WNLI_FIVE_SHOT = get_data_wnli(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
@@ -2037,26 +1865,20 @@ def get_data_rte(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['rte'][res] for res in ALL_RESULTS[model][eval_mode]['rte']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -2081,39 +1903,28 @@ RTE_ZERO_SHOT = get_data_rte(eval_mode="zero_shot")
 RTE_FIVE_SHOT = get_data_rte(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 def get_data_mrpc(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
-        results_list = [ALL_RESULTS[model][eval_mode]['mrpc'][res] for res in ALL_RESULTS[model][eval_mode]['mrpc']]
         try:
             accuracy = median([results['accuracy'] for results in results_list])
-        except:
-            accuracy = -1
-        res = {
-            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
-            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
-            "Accuracy": accuracy,
-        }
-        df_list.append(res)
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
@@ -2210,8 +2021,8 @@ with block:
     - **Mode of Evaluation**: Zero-Shot, Five-Shot
     ### The following table shows the performance of the models on the SeaEval benchmark.
-    - For **Zero-shot** performance, it is the median value from 5 distinct prompts shown on the above leaderboard to mitigate the influence of random variations induced by prompts.
-    - (-1) value indicates the results are ready yet.
     - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     """)
@@ -2348,7 +2159,7 @@ with block:
-        with gr.TabItem("Cultural Reasoning and Understanding"):
             # dataset 3: SG_EVAL
             with gr.TabItem("SG_EVAL"):
@@ -2697,7 +2508,7 @@ with block:
                     """)
-        with gr.TabItem("FLORES Translation"):
             # dataset 8:
@@ -2805,7 +2616,7 @@ with block:
                     """)
-        with gr.TabItem("Emotion Recognition"):
             # dataset 18:
             with gr.TabItem("ind_emotion"):
@@ -2941,7 +2752,7 @@ with block:
-        with gr.TabItem("Fundamental NLP"):
             # dataset

     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['cross_xquad'][res] for res in ALL_RESULTS[model][eval_mode]['cross_xquad']]
             overall_acc = [results['overall_acc'] for results in results_list]
             overall_acc = median(overall_acc)
             AC3_3 = [results['AC3_3'] for results in results_list]
             AC3_3 = median(AC3_3)
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": overall_acc,
+                "Cross-Lingual Consistency": consistency_score_3,
+                "AC3": AC3_3,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "cross_xquad_overall"))
     df = pd.DataFrame(df_list)
     return df
 # Module-level cross_xquad overall results, computed by
 # get_data_cross_xquad_overall once per evaluation mode.
 CROSS_XQUAD_ZERO_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="zero_shot")
 CROSS_XQUAD_FIVE_SHOT_OVERALL = get_data_cross_xquad_overall(eval_mode="five_shot")
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['cross_xquad'][res] for res in ALL_RESULTS[model][eval_mode]['cross_xquad']]
             English    = [results['language_acc']['English'] for results in results_list]
             Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
             Chinese    = [results['language_acc']['Chinese'] for results in results_list]
             Chinese    = median(Chinese)
             Spanish    = median(Spanish)
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "English": English,
+                "Vietnamese": Vietnamese,
+                "Chinese": Chinese,
+                "Spanish": Spanish,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "cross_xquad_lang"))
     df = pd.DataFrame(df_list)
     return df
 # Module-level cross_xquad per-language results, computed by
 # get_data_cross_xquad_language once per evaluation mode.
 CROSS_XQUAD_ZERO_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="zero_shot")
 CROSS_XQUAD_FIVE_SHOT_LANGUAGE = get_data_cross_xquad_language(eval_mode="five_shot")
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']]
             overall_acc = [results['overall_acc'] for results in results_list]
             overall_acc = median(overall_acc)
             AC3_3 = [results['AC3_3'] for results in results_list]
             AC3_3 = median(AC3_3)
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": overall_acc,
+                "Cross-Lingual Consistency": consistency_score_3,
+                "AC3": AC3_3,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "cross_mmlu_overall"))
     df = pd.DataFrame(df_list)
     return df
 # Module-level cross_mmlu overall results, computed by
 # get_data_cross_mmlu_overall once per evaluation mode.
 CROSS_MMLU_ZERO_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="zero_shot")
 CROSS_MMLU_FIVE_SHOT_OVERALL = get_data_cross_mmlu_overall(eval_mode="five_shot")
     df_list = []
     for model in MODEL_LIST:
+        try:
+            results_list = [ALL_RESULTS[model][eval_mode]['cross_mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cross_mmlu']]
             English    = [results['language_acc']['English'] for results in results_list]
             Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
             Chinese    = [results['language_acc']['Chinese'] for results in results_list]
             Spanish    = median(Spanish)
             Malay      = median(Malay)
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "English": English,
+                "Vietnamese": Vietnamese,
+                "Chinese": Chinese,
+                "Indonesian": Indonesian,
+                "Filipino": Filipino,
+                "Spanish": Spanish,
+                "Malay": Malay,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "cross_mmlu_lang"))
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     return df
 # Module-level cross_mmlu per-language results, computed by
 # get_data_cross_mmlu_language once per evaluation mode.
 CROSS_MMLU_ZERO_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="zero_shot")
 CROSS_MMLU_FIVE_SHOT_LANGUAGE = get_data_cross_mmlu_language(eval_mode="five_shot")
     df_list = []
     for model in MODEL_LIST:
+        try:
+            results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']]
             overall_acc = [results['overall_acc'] for results in results_list]
             overall_acc = median(overall_acc)
             AC3_3 = [results['AC3_3'] for results in results_list]
             AC3_3 = median(AC3_3)
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": overall_acc,
+                "Cross-Lingual Consistency": consistency_score_3,
+                "AC3": AC3_3,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "cross_logiqa_overall"))
     df = pd.DataFrame(df_list)
     df_list = []
     for model in MODEL_LIST:
+        try:
+            results_list = [ALL_RESULTS[model][eval_mode]['cross_logiqa'][res] for res in ALL_RESULTS[model][eval_mode]['cross_logiqa']]
             English    = [results['language_acc']['English'] for results in results_list]
             Vietnamese = [results['language_acc']['Vietnamese'] for results in results_list]
             Chinese    = [results['language_acc']['Chinese'] for results in results_list]
             Spanish    = median(Spanish)
             Malay      = median(Malay)
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "English": English,
+                "Vietnamese": Vietnamese,
+                "Chinese": Chinese,
+                "Indonesian": Indonesian,
+                "Filipino": Filipino,
+                "Spanish": Spanish,
+                "Malay": Malay,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "cross_logiqa_language"))
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['sg_eval'][res] for res in ALL_RESULTS[model][eval_mode]['sg_eval']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "sg_eval"))
     df = pd.DataFrame(df_list)
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['us_eval'][res] for res in ALL_RESULTS[model][eval_mode]['us_eval']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "us_eval"))
     df = pd.DataFrame(df_list)
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['cn_eval'][res] for res in ALL_RESULTS[model][eval_mode]['cn_eval']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "cn_eval"))
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     return df
 # Module-level cn_eval results, computed by get_data_cn_eval once per evaluation mode.
 CN_EVAL_ZERO_SHOT = get_data_cn_eval(eval_mode="zero_shot")
 CN_EVAL_FIVE_SHOT = get_data_cn_eval(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['ph_eval'][res] for res in ALL_RESULTS[model][eval_mode]['ph_eval']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "ph_eval"))
     df = pd.DataFrame(df_list)
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['sing2eng'][res] for res in ALL_RESULTS[model][eval_mode]['sing2eng']]
             bleu_score = median([results['bleu_score'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "BLEU": bleu_score,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "sing2eng"))
     df = pd.DataFrame(df_list)
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['flores_ind2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_ind2eng']]
             bleu_score = median([results['bleu_score'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "BLEU": bleu_score,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "flores_ind2eng"))
     df = pd.DataFrame(df_list)
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['flores_vie2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_vie2eng']]
             bleu_score = median([results['bleu_score'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "BLEU": bleu_score,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "flores_vie2eng"))
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['flores_zho2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_zho2eng']]
             bleu_score = median([results['bleu_score'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "BLEU": bleu_score,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "flores_zho2eng"))
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     return df
 # Module-level flores_zho2eng results, computed by get_data_flores_zho2eng
 # once per evaluation mode.
 FLORES_ZHO2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
 FLORES_ZHO2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['flores_zsm2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_zsm2eng']]
             bleu_score = median([results['bleu_score'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "BLEU": bleu_score,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "flores_zsm2eng"))
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     return df
 # Use the flores_zsm2eng loader here so these tables are not a copy of the
 # FLORES_ZHO2ENG_* tables (which come from get_data_flores_zho2eng).
 FLORES_ZSM2ENG_ZERO_SHOT = get_data_flores_zsm2eng(eval_mode="zero_shot")
 FLORES_ZSM2ENG_FIVE_SHOT = get_data_flores_zsm2eng(eval_mode="five_shot")
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['mmlu'][res] for res in ALL_RESULTS[model][eval_mode]['mmlu']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
         except:
             accuracy = -1
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 def get_data_mmlu_full(eval_mode='zero_shot', fillna=True, rank=True):
     # Build a DataFrame with one row per model in MODEL_LIST for the
     # 'mmlu_full' entries stored under ALL_RESULTS[model][eval_mode].
     # The "Accuracy" column is median(...) over the 'accuracy' value of
     # every entry found for that model.
     # NOTE(review): fillna and rank are not used in the lines visible here;
     # this view may elide code between the DataFrame construction and the
     # return -- confirm against the full file.
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['mmlu_full'][res] for res in ALL_RESULTS[model][eval_mode]['mmlu_full']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
             # Bare except: any error raised while building the row (not only
             # a missing key) skips this model and is reported as "Not found".
+            print('Not found in model: {} for {}'.format(model, "mmlu_full"))
     df = pd.DataFrame(df_list)
     return df
 # Module-level mmlu_full tables, computed by get_data_mmlu_full once per evaluation mode.
 MMLU_FULL_ZERO_SHOT = get_data_mmlu_full(eval_mode="zero_shot")
 MMLU_FULL_FIVE_SHOT = get_data_mmlu_full(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 def get_data_c_eval(eval_mode='zero_shot', fillna=True, rank=True):
     # Build a DataFrame with one row per model in MODEL_LIST for the
     # 'c_eval' entries stored under ALL_RESULTS[model][eval_mode].
     # The "Accuracy" column is median(...) over the 'accuracy' value of
     # every entry found for that model.
     # NOTE(review): fillna and rank are not used in the lines visible here;
     # the "merge them" comment below suggests elided code before the
     # return -- confirm against the full file.
     df_list = []
+    for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['c_eval'][res] for res in ALL_RESULTS[model][eval_mode]['c_eval']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
             # Bare except: any error raised while building the row (not only
             # a missing key) skips this model and is reported as "Not found".
+            print('Not found in model: {} for {}'.format(model, "c_eval"))
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     return df
 # Module-level c_eval tables, computed by get_data_c_eval once per evaluation mode.
 C_EVAL_ZERO_SHOT = get_data_c_eval(eval_mode="zero_shot")
 C_EVAL_FIVE_SHOT = get_data_c_eval(eval_mode="five_shot")
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['c_eval_full'][res] for res in ALL_RESULTS[model][eval_mode]['c_eval_full']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "c_eval_full"))
     df = pd.DataFrame(df_list)
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['cmmlu'][res] for res in ALL_RESULTS[model][eval_mode]['cmmlu']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
         except:
+            print('Not found in model: {} for {}'.format(model, "cmmlu"))
     df = pd.DataFrame(df_list)
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['cmmlu_full'][res] for res in ALL_RESULTS[model][eval_mode]['cmmlu_full']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
         except:
+            print('Not found in model: {} for {}'.format(model, "cmmlu_full"))
     df = pd.DataFrame(df_list)
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['zbench'][res] for res in ALL_RESULTS[model][eval_mode]['zbench']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "zbench"))
     df = pd.DataFrame(df_list)
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['indommlu'][res] for res in ALL_RESULTS[model][eval_mode]['indommlu']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
         except:
+            print('Not found in model: {} for {}'.format(model, "indommlu"))
     df = pd.DataFrame(df_list)
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 def get_data_ind_emotion(eval_mode='zero_shot', fillna=True, rank=True):
     # Build a DataFrame with one row per model in MODEL_LIST for the
     # 'ind_emotion' entries stored under ALL_RESULTS[model][eval_mode].
     # The "Accuracy" column is median(...) over the 'accuracy' value of
     # every entry found for that model.
     # NOTE(review): fillna and rank are not used in the lines visible here;
     # the "merge them" comment below suggests elided code before the
     # return -- confirm against the full file.
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['ind_emotion'][res] for res in ALL_RESULTS[model][eval_mode]['ind_emotion']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
             # Bare except: any error raised while building the row (not only
             # a missing key) skips this model and is reported as "Not found".
+            print('Not found in model: {} for {}'.format(model, "ind_emotion"))
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     return df
 # Module-level ind_emotion tables, computed by get_data_ind_emotion once per evaluation mode.
 IND_EMOTION_ZERO_SHOT = get_data_ind_emotion(eval_mode="zero_shot")
 IND_EMOTION_FIVE_SHOT = get_data_ind_emotion(eval_mode="five_shot")
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['ocnli'][res] for res in ALL_RESULTS[model][eval_mode]['ocnli']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "ocnli"))
     df = pd.DataFrame(df_list)
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['c3'][res] for res in ALL_RESULTS[model][eval_mode]['c3']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "c3"))
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['dream'][res] for res in ALL_RESULTS[model][eval_mode]['dream']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "dream"))
     df = pd.DataFrame(df_list)
     return df
 # Module-level dream results, computed by get_data_dream once per evaluation mode.
 DREAM_ZERO_SHOT = get_data_dream(eval_mode="zero_shot")
 DREAM_FIVE_SHOT = get_data_dream(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # Collect one table row per model for the 'samsum' results: the median of
 # rouge1 / rouge2 / rougeL over every entry stored for the given `eval_mode`.
 # NOTE(review): `fillna` and `rank` are not referenced in the lines shown here;
 # presumably they are applied further down in the function -- TODO confirm.
 def get_data_samsum(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
         try:
             # Every value stored under ['samsum'] for this model and eval mode
             # (presumably one entry per prompt variant -- TODO confirm).
+            results_list = [ALL_RESULTS[model][eval_mode]['samsum'][res] for res in ALL_RESULTS[model][eval_mode]['samsum']]
             rouge1 = median([results['rouge1'] for results in results_list])
             rouge2 = median([results['rouge2'] for results in results_list])
             rougeL = median([results['rougeL'] for results in results_list])
             # Row for this model; the "Model" cell is whatever
             # make_clickable_model returns for the model name and its link.
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "ROUGE-1": rouge1,
+                "ROUGE-2": rouge2,
+                "ROUGE-L": rougeL,
+            }
+            df_list.append(res)
+        except:
             # The append above sits inside the try, so a model whose row could
             # not be built gets no row at all.
             # NOTE(review): the bare `except:` treats every exception raised
             # above (not only a missing key) as "not found"; consider narrowing.
+            print('Not found in model: {} for {}'.format(model, "samsum"))
     # One DataFrame row per model that was processed without an exception.
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['dialogsum'][res] for res in ALL_RESULTS[model][eval_mode]['dialogsum']]
             rouge1 = median([results['rouge1'] for results in results_list])
             rouge2 = median([results['rouge2'] for results in results_list])
             rougeL = median([results['rougeL'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "ROUGE-1": rouge1,
+                "ROUGE-2": rouge2,
+                "ROUGE-L": rougeL,
+            }
+            df_list.append(res)
         except:
+            print('Not found in model: {} for {}'.format(model, "dialogsum"))
     df = pd.DataFrame(df_list)
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['sst2'][res] for res in ALL_RESULTS[model][eval_mode]['sst2']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
         except:
+            print('Not found in model: {} for {}'.format(model, "sst2"))
     df = pd.DataFrame(df_list)
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['cola'][res] for res in ALL_RESULTS[model][eval_mode]['cola']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "cola"))
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['qqp'][res] for res in ALL_RESULTS[model][eval_mode]['qqp']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "qqp"))
     df = pd.DataFrame(df_list)
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['mnli'][res] for res in ALL_RESULTS[model][eval_mode]['mnli']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "mnli"))
     df = pd.DataFrame(df_list)
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['qnli'][res] for res in ALL_RESULTS[model][eval_mode]['qnli']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "qnli"))
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['wnli'][res] for res in ALL_RESULTS[model][eval_mode]['wnli']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "wnli"))
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     return df
 # Evaluate get_data_wnli once per evaluation mode and keep both results in
 # module-level names.
 WNLI_ZERO_SHOT = get_data_wnli(eval_mode="zero_shot")
 WNLI_FIVE_SHOT = get_data_wnli(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
     df_list = []
     for model in MODEL_LIST:
         try:
+            results_list = [ALL_RESULTS[model][eval_mode]['rte'][res] for res in ALL_RESULTS[model][eval_mode]['rte']]
             accuracy = median([results['accuracy'] for results in results_list])
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
+            print('Not found in model: {} for {}'.format(model, "rte"))
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
 # Result of get_data_rte for the five-shot evaluation mode, kept in a
 # module-level name.
 RTE_FIVE_SHOT = get_data_rte(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # Collect one table row per model for the 'mrpc' results: the median
 # 'accuracy' over every entry stored for the given `eval_mode`.
 # NOTE(review): `fillna` and `rank` are not referenced in the lines shown here;
 # presumably they are applied further down in the function -- TODO confirm.
 def get_data_mrpc(eval_mode='zero_shot', fillna=True, rank=True):
     df_list = []
     for model in MODEL_LIST:
         try:
             # Every value stored under ['mrpc'] for this model and eval mode
             # (presumably one entry per prompt variant -- TODO confirm).
+            results_list = [ALL_RESULTS[model][eval_mode]['mrpc'][res] for res in ALL_RESULTS[model][eval_mode]['mrpc']]
             accuracy = median([results['accuracy'] for results in results_list])
             # Row for this model; the "Model" cell is whatever
             # make_clickable_model returns for the model name and its link.
+            res = {
+                "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+                "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+                "Accuracy": accuracy,
+            }
+            df_list.append(res)
+        except:
             # The append above sits inside the try, so a model whose row could
             # not be built gets no row at all.
             # NOTE(review): the bare `except:` treats every exception raised
             # above (not only a missing key) as "not found"; consider narrowing.
+            print('Not found in model: {} for {}'.format(model, "mrpc"))
     # One DataFrame row per model that was processed without an exception.
     df = pd.DataFrame(df_list)
     # If there are any models that are the same, merge them
     - **Mode of Evaluation**: Zero-Shot, Five-Shot
     ### The following table shows the performance of the models on the SeaEval benchmark.
+    - For **Zero-Shot** performance, it is the median value from 5 distinct prompts shown on the above leaderboard to mitigate the influence of random variations induced by prompts.
+    - I am trying to evaluate the base models for five-shot performance and instruction-tuned models for zero-shot.
     - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     """)
+        with gr.TabItem("Cultural Reasoning"):
             # dataset 3: SG_EVAL
             with gr.TabItem("SG_EVAL"):
                     """)
+        with gr.TabItem("FLORES-Translation"):
             # dataset 8:
                     """)
+        with gr.TabItem("Emotion"):
             # dataset 18:
             with gr.TabItem("ind_emotion"):
+        with gr.TabItem("Fundamental NLP Tasks"):
             # dataset