Spaces:

cot-leaderboard
/

open-cot-dashboard

Running

+import gradio as gr  # type: ignore
+import plotly.express as px  # type: ignore
+from backend.data import load_cot_data
+from backend.envs import API, REPO_ID, TOKEN
+# Logo images shown below the page title (AI2 and Logikon AI).
+logo1_url = "https://raw.githubusercontent.com/logikon-ai/cot-eval/main/assets/AI2_Logo_Square.png"
+logo2_url = "https://raw.githubusercontent.com/logikon-ai/cot-eval/main/assets/logo_logikon_notext_withborder.png"
+# HTML snippet: both logos in a centered flex row, each wrapped in a link to the organisation's website.
+LOGOS = f'<div style="display: flex; justify-content: center;"><a href="https://allenai.org/"><img src="{logo1_url}" alt="AI2" style="width: 30vw; min-width: 20px; max-width: 60px;"></a> <a href="https://logikon.ai"><img src="{logo2_url}" alt="Logikon AI" style="width: 30vw; min-width: 20px; max-width: 60px; margin-left: 10px;"></a></div>'
+# Page heading followed by the logo row; passed to gr.HTML when the UI is built.
+TITLE = f'<h1 align="center" id="space-title"> Open CoT Dashboard</h1> {LOGOS}'
+# Markdown blurb displayed under the title; passed to gr.Markdown when the UI is built.
+INTRODUCTION_TEXT = """
+Baseline accuracies and marginal accuracy gains for specific models and CoT regimes from the [Open CoT Leaderboard](https://huggingface.co/spaces/logikon/open_cot_leaderboard).
+"""
+def restart_space():
+    API.restart_space(repo_id=REPO_ID, token=TOKEN)
+try:
+    df_cot_err, df_cot_regimes = load_cot_data()
+except Exception:
+    restart_space()
+def plot_evals(model_id, plotly_mode, request: gr.Request):
+    df = df_cot_err.copy()
+    if request and "model" in request.query_params:
+        model_param = request.query_params["model"]
+        if model_param in df.model.to_list():
+            model_id = model_param
+    df["selected"] = df_cot_err.model.apply(lambda x: "selected" if x==model_id else "-")
+    #df.sort_values(["selected", "model"], inplace=True, ascending=True)  # has currently no effect with px.scatter
+    template = "plotly_dark" if plotly_mode=="dark" else "plotly"
+    fig = px.scatter(df, x="base accuracy", y="marginal acc. gain", color="selected", symbol="model",
+                    facet_col="task", facet_col_wrap=3,
+                    category_orders={"selected": ["selected", "-"]},
+                    color_discrete_sequence=["Orange", "Gray"],
+                    template=template,
+                    error_y="acc_gain-err", hover_data=['model', "cot accuracy"],
+                    width=1200, height=700)
+    fig.update_layout(
+        title={"automargin": True},
+    )
+    return fig, model_id
+def get_model_table(model_id):
+    def make_pretty(styler):
+        styler.hide(axis="index")
+        styler.format(precision=1),
+        styler.background_gradient(
+            axis=None,
+            subset=["acc_base", "acc_cot"],
+            vmin=20, vmax=100, cmap="YlGnBu"
+        )
+        styler.background_gradient(
+            axis=None,
+            subset=["acc_gain"],
+            vmin=-20, vmax=20, cmap="coolwarm"
+        )
+        styler.set_table_styles({
+            'task': [{'selector': '',
+                  'props': [('font-weight', 'bold')]}],
+            'B': [{'selector': 'td',
+                  'props': 'color: blue;'}]
+        }, overwrite=False)
+        return styler
+    df_cot_model = df_cot_regimes[df_cot_regimes.model.eq(model_id)][['task', 'cot_chain', 'best_of',
+          'temperature', 'top_k', 'top_p', 'acc_base', 'acc_cot', 'delta_abs']]
+    df_cot_model = df_cot_model \
+      .rename(columns={"temperature": "temp"}) \
+      .replace({'cot_chain': 'ReflectBeforeRun'}, "Reflect") \
+      .sort_values(["task", "cot_chain"]) \
+      .reset_index(drop=True)
+    return df_cot_model.style.pipe(make_pretty)
+def styled_model_table(model_id, request: gr.Request):
+    if request and "model" in request.query_params:
+        model_param = request.query_params["model"]
+        if model_param in df_cot_regimes.model.to_list():
+            model_id = model_param
+    return get_model_table(model_id)
+# Assemble the dashboard UI. Components are created in display order inside
+# the Blocks context.
+demo = gr.Blocks()
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT)
+    # Controls: model picker, plot theme switch and an explicit update button.
+    with gr.Row():
+        model_list = gr.Dropdown(list(df_cot_err.model.unique()), value="allenai/tulu-2-70b", label="Model", scale=2)
+        plotly_mode = gr.Radio(["dark","light"], value="dark", label="Plot theme", scale=1)
+        submit = gr.Button("Update", scale=1)
+    # Outputs: the per-regime table and the scatter plot.
+    table = gr.DataFrame()
+    plot = gr.Plot(label="evals")
+    # plot_evals also writes the effective model back to the dropdown, so a
+    # model chosen through the URL query parameter shows up in the control.
+    submit.click(plot_evals, [model_list, plotly_mode], [plot, model_list])
+    submit.click(styled_model_table, model_list, table)
+    # Same two callbacks registered on the load event, so plot and table are
+    # filled without pressing the button.
+    demo.load(plot_evals, [model_list, plotly_mode], [plot, model_list])
+    demo.load(styled_model_table, model_list, table)
+demo.launch()

backend/data.py ADDED Viewed

	@@ -0,0 +1,139 @@

+import glob
+import json
+import datasets  # type: ignore
+from huggingface_hub import snapshot_download  # type: ignore
+import pandas as pd  # type: ignore
+from backend.envs import EVAL_DATASET, TRACES_DATASET, TOKEN, EVAL_RESULTS_PATH
+# Sub-directories of each model's results folder that load_cot_data scans for result files.
+SUBSETS = ["base","cot","orig"]
+def load_cot_data():
+    ####
+    # Load the evaluation results data
+    ####
+    # download raw data
+    snapshot_download(
+        repo_id=EVAL_DATASET,
+        revision="main",
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        max_workers=60,
+        token=TOKEN
+    )
+    # get all models for which results are stored
+    models = []
+    for path in glob.glob(f"{EVAL_RESULTS_PATH}/data/*/*", recursive=False):
+        models.append(path.replace(f"{EVAL_RESULTS_PATH}/data/",""))
+    # load the evaluation results and create a dataframe
+    results = []
+    for model in models:
+        for subset in SUBSETS:
+            result_files = glob.glob(f"{EVAL_RESULTS_PATH}/data/{model}/{subset}/**/*.json", recursive=True)
+            for json_filepath in result_files:
+                with open(json_filepath) as fp:
+                    data = json.load(fp)
+                if "results" in data.keys():
+                    for k,v in data["results"].items():
+                        record = v.copy()
+                        record["model"] = model
+                        record["subset"] = subset
+                        results.append(record)
+    df_results = pd.DataFrame(results)
+    del results
+    # postprocess task/config data
+    def split_alias(alias: str) -> pd.Series:
+        if alias[-5:]=="_base":
+            alias = alias[:-5]
+        elif alias[-4:]=="_cot":
+            alias = alias[:-4]
+        if "_" not in alias:
+            task = alias
+            config = ""
+        else:
+            config, task = alias.split("_")
+        return pd.Series({"task": task, "config": config})
+    df_results = pd.concat([df_results, df_results.alias.apply(split_alias)], axis=1)
+    # baseline accuracies in separete df
+    df_baseline = df_results[df_results.subset.eq("base")].groupby(["model","task"])[["acc,none"]].mean()
+    # build cot eval df with baseline accuracies in separate column
+    df_tmp1 = df_results[df_results.subset.eq("cot")].sort_values(by=["model","task","config"])
+    df_tmp1.reset_index(inplace=True, drop=True)
+    df_cot = df_tmp1[["model","task","config"]].copy()
+    df_cot["acc_cot"] = df_tmp1["acc,none"]
+    df_cot["acc_base"] = df_cot.apply(lambda row: df_baseline.loc[(row.model, row.task)]["acc,none"], axis=1)
+    df_cot["acc_gain"] = df_cot.acc_cot - df_cot.acc_base
+    df_cot["delta_rel"] = (df_cot.acc_cot - df_cot.acc_base)/df_cot.acc_base
+    # average eval results for all tasks in extra df
+    df_cot_avg = df_cot.groupby(["model","config"]).mean(numeric_only=True).reset_index()
+    df_cot_avg["task"] = "all"
+    # add average results to cot df
+    df_cot = pd.concat([df_cot_avg, df_cot], ignore_index=True)
+    ####
+    # Load the traces data
+    ####
+    # load traces data and extract configs
+    dataset = datasets.load_dataset(TRACES_DATASET, split="test", token=TOKEN)
+    dataset = dataset.select_columns(["config_data"])
+    df_cottraces = pd.DataFrame({"config_data": dataset["config_data"]})
+    del dataset
+    config_data = []
+    for data in df_cottraces.config_data.to_list():
+        config_data.append(dict(data))
+    del df_cottraces
+    df_cotconfigs = pd.DataFrame(config_data)
+    df_cotconfigs.drop_duplicates(inplace=True, ignore_index=True)
+    df_cotconfigs
+    # add cot configs data to df_cot
+    def select_config_data(row):
+        df_selected = df_cotconfigs[df_cotconfigs.name.eq(row.config) & df_cotconfigs.model.eq(row.model)]
+        if len(df_selected) == 0:
+            print(f"Config {row.config} not found for model {row.model}")
+            return None
+        return df_selected.drop(columns=["name", "model", "task"]).iloc[0]
+    df_cot = pd.concat(
+        [
+            df_cot,
+            df_cot.apply(select_config_data, axis=1)
+        ],
+        axis=1
+    )
+    # accuracy values in percent
+    for col in ['acc_base', 'acc_cot', 'acc_gain']:
+        df_cot[col] = 100 * df_cot[col]
+    ####
+    # Create error dataframe
+    ####
+    df_cot_err = df_cot.groupby(["model","task"]).agg({'acc_gain': ['mean', 'min', 'max'], "acc_base": "mean", "acc_cot": "mean"})
+    df_cot_err.columns = ['-'.join(col).strip() for col in df_cot_err.columns.values]
+    df_cot_err["acc_gain-err"] = 0.5 * (df_cot_err["acc_gain-max"] - df_cot_err["acc_gain-min"])
+    df_cot_err.reset_index(inplace=True)
+    df_cot_err.rename(columns={"acc_base-mean": "base accuracy", "acc_cot-mean": "cot accuracy", "acc_gain-mean": "marginal acc. gain"}, inplace=True)
+    return df_cot_err, df_cot

backend/envs.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import os
+from huggingface_hub import HfApi  # type: ignore
+# Access token and Hub repository identifiers used by the dashboard.
+# TOKEN is read from the environment and is None when the variable is unset.
+TOKEN = os.environ.get("TOKEN", None)
+OWNER = "cot-leaderboard"
+# Repository ids, all under OWNER.
+REPO_ID = f"{OWNER}/open_cot_dashboard"
+EVAL_DATASET = f"{OWNER}/cot-eval-results"
+TRACES_DATASET = f"{OWNER}/cot-eval-traces-2.0"
+# Base directory for local caches: HF_HOME if set, otherwise the current directory.
+CACHE_PATH=os.getenv("HF_HOME", ".")
+# Local caches
+EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "cot-eval-results")
+# Hub API client created with TOKEN.
+API = HfApi(token=TOKEN)