Spaces:

SeaEval
/

SeaEval_Leaderboard

Running

App Files Files Community

binwang commited on Dec 20, 2023

Commit

8024fdd

•

1 Parent(s): 5da889a

add a few new datasets

Browse files

Files changed (1) hide show

app.py +413 -0

app.py CHANGED Viewed

@@ -547,6 +547,275 @@ def get_data_ph_eval(eval_mode='zero_shot', fillna=True, rank=True):
 PH_EVAL_ZERO_SHOT = get_data_ph_eval(eval_mode="zero_shot")
 PH_EVAL_FIVE_SHOT = get_data_ph_eval(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
@@ -792,7 +1061,151 @@ with block:
                         )
     gr.Markdown(r"""

 PH_EVAL_ZERO_SHOT = get_data_ph_eval(eval_mode="zero_shot")
 PH_EVAL_FIVE_SHOT = get_data_ph_eval(eval_mode="five_shot")
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+def get_data_sing2eng(eval_mode='zero_shot', fillna=True, rank=True):
+    df_list = []
+    for model in MODEL_LIST:
+        results_list = [ALL_RESULTS[model][eval_mode]['sing2eng'][res] for res in ALL_RESULTS[model][eval_mode]['sing2eng']]
+        try:
+            bleu_score = median([results['bleu_score'] for results in results_list])
+        except:
+            print(results_list)
+            bleu_score = -1
+        res = {
+            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+            "BLEU": bleu_score,
+        }
+        df_list.append(res)
+    df = pd.DataFrame(df_list)
+    # If there are any models that are the same, merge them
+    # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
+    df = df.groupby("Model", as_index=False).first()
+    # Put 'Model' column first
+    #cols = sorted(list(df.columns))
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(cols.index("Model")))
+    df = df[cols]
+    if rank:
+        df = add_rank(df, compute_average=True)
+    if fillna:
+        df.fillna("", inplace=True)
+    return df
+SING2ENG_ZERO_SHOT = get_data_sing2eng(eval_mode="zero_shot")
+SING2ENG_FIVE_SHOT = get_data_sing2eng(eval_mode="five_shot")
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+def get_data_flores_ind2eng(eval_mode='zero_shot', fillna=True, rank=True):
+    df_list = []
+    for model in MODEL_LIST:
+        results_list = [ALL_RESULTS[model][eval_mode]['flores_ind2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_ind2eng']]
+        try:
+            bleu_score = median([results['bleu_score'] for results in results_list])
+        except:
+            print(results_list)
+            bleu_score = -1
+        res = {
+            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+            "BLEU": bleu_score,
+        }
+        df_list.append(res)
+    df = pd.DataFrame(df_list)
+    # If there are any models that are the same, merge them
+    # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
+    df = df.groupby("Model", as_index=False).first()
+    # Put 'Model' column first
+    #cols = sorted(list(df.columns))
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(cols.index("Model")))
+    df = df[cols]
+    if rank:
+        df = add_rank(df, compute_average=True)
+    if fillna:
+        df.fillna("", inplace=True)
+    return df
+FLORES_IND2ENG_ZERO_SHOT = get_data_flores_ind2eng(eval_mode="zero_shot")
+FLORES_IND2ENG_FIVE_SHOT = get_data_flores_ind2eng(eval_mode="five_shot")
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+def get_data_flores_vie2eng(eval_mode='zero_shot', fillna=True, rank=True):
+    df_list = []
+    for model in MODEL_LIST:
+        results_list = [ALL_RESULTS[model][eval_mode]['flores_vie2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_vie2eng']]
+        try:
+            bleu_score = median([results['bleu_score'] for results in results_list])
+        except:
+            print(results_list)
+            bleu_score = -1
+        res = {
+            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+            "BLEU": bleu_score,
+        }
+        df_list.append(res)
+    df = pd.DataFrame(df_list)
+    # If there are any models that are the same, merge them
+    # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
+    df = df.groupby("Model", as_index=False).first()
+    # Put 'Model' column first
+    #cols = sorted(list(df.columns))
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(cols.index("Model")))
+    df = df[cols]
+    if rank:
+        df = add_rank(df, compute_average=True)
+    if fillna:
+        df.fillna("", inplace=True)
+    return df
+FLORES_VIE2ENG_ZERO_SHOT = get_data_flores_vie2eng(eval_mode="zero_shot")
+FLORES_VIE2ENG_FIVE_SHOT = get_data_flores_vie2eng(eval_mode="five_shot")
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+def get_data_flores_zho2eng(eval_mode='zero_shot', fillna=True, rank=True):
+    df_list = []
+    for model in MODEL_LIST:
+        results_list = [ALL_RESULTS[model][eval_mode]['flores_zho2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_zho2eng']]
+        try:
+            bleu_score = median([results['bleu_score'] for results in results_list])
+        except:
+            print(results_list)
+            bleu_score = -1
+        res = {
+            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+            "BLEU": bleu_score,
+        }
+        df_list.append(res)
+    df = pd.DataFrame(df_list)
+    # If there are any models that are the same, merge them
+    # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
+    df = df.groupby("Model", as_index=False).first()
+    # Put 'Model' column first
+    #cols = sorted(list(df.columns))
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(cols.index("Model")))
+    df = df[cols]
+    if rank:
+        df = add_rank(df, compute_average=True)
+    if fillna:
+        df.fillna("", inplace=True)
+    return df
+FLORES_ZHO2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
+FLORES_ZHO2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+# =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
+def get_data_flores_zsm2eng(eval_mode='zero_shot', fillna=True, rank=True):
+    df_list = []
+    for model in MODEL_LIST:
+        results_list = [ALL_RESULTS[model][eval_mode]['flores_zsm2eng'][res] for res in ALL_RESULTS[model][eval_mode]['flores_zsm2eng']]
+        try:
+            bleu_score = median([results['bleu_score'] for results in results_list])
+        except:
+            print(results_list)
+            bleu_score = -1
+        res = {
+            "Model Size (Params)": MODEL_TO_SIZE.get(model, ""),
+            "Model": make_clickable_model(model, link=ALL_RESULTS[model]["model_link"]),
+            "BLEU": bleu_score,
+        }
+        df_list.append(res)
+    df = pd.DataFrame(df_list)
+    # If there are any models that are the same, merge them
+    # E.g. if out["Model"] has the same value in two places, merge & take whichever one is not NaN else just take the first one
+    df = df.groupby("Model", as_index=False).first()
+    # Put 'Model' column first
+    #cols = sorted(list(df.columns))
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(cols.index("Model")))
+    df = df[cols]
+    if rank:
+        df = add_rank(df, compute_average=True)
+    if fillna:
+        df.fillna("", inplace=True)
+    return df
+FLORES_ZSM2ENG_ZERO_SHOT = get_data_flores_zho2eng(eval_mode="zero_shot")
+FLORES_ZSM2ENG_FIVE_SHOT = get_data_flores_zho2eng(eval_mode="five_shot")
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
 # =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =  =
                         )
+        # dataset 7:
+        with gr.TabItem("Singlish to English Translation"):
+            with gr.Row():
+                gr.Markdown("""
+                **SING2ENG Leaderboard** 🔮
+                - **Metric:** BLEU Avg.
+                - **Languages:** English
+                """)
+            with gr.TabItem("zero_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            SING2ENG_ZERO_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(SING2ENG_ZERO_SHOT.columns),
+                            type="pandas",
+                        )
+            with gr.TabItem("five_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            SING2ENG_FIVE_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(SING2ENG_FIVE_SHOT.columns),
+                            type="pandas",
+                        )
+        # dataset 8:
+        with gr.TabItem("FLORES Indonesian to English Translation"):
+            with gr.Row():
+                gr.Markdown("""
+                **flores_ind2eng Leaderboard** 🔮
+                - **Metric:** BLEU Avg.
+                - **Languages:** English
+                """)
+            with gr.TabItem("zero_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            FLORES_IND2ENG_ZERO_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(FLORES_IND2ENG_ZERO_SHOT.columns),
+                            type="pandas",
+                        )
+            with gr.TabItem("five_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            FLORES_IND2ENG_FIVE_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(FLORES_IND2ENG_FIVE_SHOT.columns),
+                            type="pandas",
+                        )
+        # dataset 9:
+        with gr.TabItem("FLORES Vitenamese to English Translation"):
+            with gr.Row():
+                gr.Markdown("""
+                **flores_vie2eng Leaderboard** 🔮
+                - **Metric:** BLEU Avg.
+                - **Languages:** English
+                """)
+            with gr.TabItem("zero_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            FLORES_VIE2ENG_ZERO_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(FLORES_VIE2ENG_ZERO_SHOT.columns),
+                            type="pandas",
+                        )
+            with gr.TabItem("five_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            FLORES_VIE2ENG_FIVE_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(FLORES_VIE2ENG_FIVE_SHOT.columns),
+                            type="pandas",
+                        )
+        # dataset 10:
+        with gr.TabItem("FLORES Chinese to English Translation"):
+            with gr.Row():
+                gr.Markdown("""
+                **flores_zho2eng Leaderboard** 🔮
+                - **Metric:** BLEU Avg.
+                - **Languages:** English
+                """)
+            with gr.TabItem("zero_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            FLORES_ZHO2ENG_ZERO_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(FLORES_ZHO2ENG_ZERO_SHOT.columns),
+                            type="pandas",
+                        )
+            with gr.TabItem("five_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            FLORES_ZHO2ENG_FIVE_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(FLORES_ZHO2ENG_FIVE_SHOT.columns),
+                            type="pandas",
+                        )
+        # dataset 10:
+        with gr.TabItem("FLORES Malay to English Translation"):
+            with gr.Row():
+                gr.Markdown("""
+                **flores_zsm2eng Leaderboard** 🔮
+                - **Metric:** BLEU Avg.
+                - **Languages:** English
+                """)
+            with gr.TabItem("zero_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            FLORES_ZSM2ENG_ZERO_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_ZERO_SHOT.columns),
+                            type="pandas",
+                        )
+            with gr.TabItem("five_shot"):
+                with gr.TabItem("Overall"):
+                    with gr.Row():
+                        gr.components.Dataframe(
+                            FLORES_ZSM2ENG_FIVE_SHOT,
+                            datatype=["number", "markdown"] + ["number"] * len(FLORES_ZSM2ENG_FIVE_SHOT.columns),
+                            type="pandas",
+                        )
     gr.Markdown(r"""