Gregor Betz committed on
Commit
13e8963
•
1 Parent(s): a1d2608

initial code upload

Browse files
Files changed (5) hide show
  1. .gitignore +1 -0
  2. README.md +1 -1
  3. app.py +109 -0
  4. backend/data.py +139 -0
  5. backend/envs.py +19 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .mypy_cache
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: Open Cot Dashboard
3
- emoji: 🦀
4
  colorFrom: yellow
5
  colorTo: blue
6
  sdk: gradio
 
1
  ---
2
  title: Open Cot Dashboard
3
+ emoji: 📊
4
  colorFrom: yellow
5
  colorTo: blue
6
  sdk: gradio
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr # type: ignore
2
+ import plotly.express as px # type: ignore
3
+
4
+ from backend.data import load_cot_data
5
+ from backend.envs import API, REPO_ID, TOKEN
6
+
7
# Static header assets and texts shown at the top of the dashboard.
logo1_url = "https://raw.githubusercontent.com/logikon-ai/cot-eval/main/assets/AI2_Logo_Square.png"
logo2_url = "https://raw.githubusercontent.com/logikon-ai/cot-eval/main/assets/logo_logikon_notext_withborder.png"

# Two linked logos, centered side by side; the pieces below concatenate to a
# single HTML string.
LOGOS = (
    '<div style="display: flex; justify-content: center;">'
    f'<a href="https://allenai.org/"><img src="{logo1_url}" alt="AI2" '
    'style="width: 30vw; min-width: 20px; max-width: 60px;"></a>'
    ' '
    f'<a href="https://logikon.ai"><img src="{logo2_url}" alt="Logikon AI" '
    'style="width: 30vw; min-width: 20px; max-width: 60px; margin-left: 10px;"></a>'
    '</div>'
)

# Page heading followed by the logo strip.
TITLE = '<h1 align="center" id="space-title"> Open CoT Dashboard</h1> ' + LOGOS

# Markdown blurb rendered below the title.
INTRODUCTION_TEXT = (
    "\n"
    "Baseline accuracies and marginal accuracy gains for specific models and CoT regimes "
    "from the [Open CoT Leaderboard](https://huggingface.co/spaces/logikon/open_cot_leaderboard).\n"
)
16
+
17
def restart_space() -> None:
    """Request a restart of the Space ``REPO_ID`` through the Hub API client.

    Uses the module-level ``API`` client and authenticates with ``TOKEN``.
    """
    API.restart_space(token=TOKEN, repo_id=REPO_ID)
19
+
20
# Load the dashboard data once at import time.
#
# If loading fails there is nothing sensible to serve: the UI below reads
# `df_cot_err` / `df_cot_regimes` directly. We therefore report the failure,
# ask the Hub to restart the Space, and re-raise so that the real cause is
# visible instead of a later NameError on the undefined dataframes.
try:
    df_cot_err, df_cot_regimes = load_cot_data()
except Exception as exc:
    # Flush explicitly: the restart request may terminate the process soon after.
    print(f"Failed to load CoT evaluation data: {exc!r}", flush=True)
    restart_space()
    raise
24
+
25
+
26
def plot_evals(model_id, plotly_mode, request: gr.Request):
    """Build the faceted scatter plot of baseline accuracy vs. marginal gain.

    Args:
        model_id: Model to highlight in the plot.
        plotly_mode: ``"dark"`` selects the ``plotly_dark`` template; any other
            value selects the default ``plotly`` template.
        request: Incoming Gradio request. A ``model`` query parameter that
            names a model present in the data overrides ``model_id``.

    Returns:
        A ``(figure, model_id)`` pair, where ``model_id`` is the model that was
        actually highlighted.
    """
    df = df_cot_err.copy()

    # A valid ?model=... query parameter takes precedence over the dropdown.
    if request and "model" in request.query_params:
        requested_model = request.query_params["model"]
        if requested_model in df.model.to_list():
            model_id = requested_model

    df["selected"] = [
        "selected" if name == model_id else "-" for name in df_cot_err.model
    ]
    # NOTE: sorting by ["selected", "model"] was left disabled in the original
    # code because it reportedly has no effect with px.scatter.

    templates = {"dark": "plotly_dark"}
    scatter_options = dict(
        x="base accuracy",
        y="marginal acc. gain",
        color="selected",
        symbol="model",
        facet_col="task",
        facet_col_wrap=3,
        category_orders={"selected": ["selected", "-"]},
        color_discrete_sequence=["Orange", "Gray"],
        template=templates.get(plotly_mode, "plotly"),
        error_y="acc_gain-err",
        hover_data=["model", "cot accuracy"],
        width=1200,
        height=700,
    )
    fig = px.scatter(df, **scatter_options)
    fig.update_layout(title=dict(automargin=True))

    return fig, model_id
47
+
48
def get_model_table(model_id):
    """Return a styled table of all CoT regimes evaluated for ``model_id``.

    The rows are taken from the module-level ``df_cot_regimes`` frame, limited
    to the regime parameters and the accuracy columns, and sorted by task and
    CoT chain.

    Args:
        model_id: Value of the ``model`` column to filter on.

    Returns:
        A pandas ``Styler`` wrapping the filtered table.
    """

    def make_pretty(styler):
        """Apply number formatting and colour gradients to the table."""
        styler.hide(axis="index")
        # The original had a stray trailing comma here, turning the call into
        # a throwaway tuple expression.
        styler.format(precision=1)
        styler.background_gradient(
            axis=None,
            subset=["acc_base", "acc_cot"],
            vmin=20, vmax=100, cmap="YlGnBu",
        )
        styler.background_gradient(
            axis=None,
            subset=["acc_gain"],
            vmin=-20, vmax=20, cmap="coolwarm",
        )
        # Bold the task column. A leftover style entry for a column named 'B',
        # which this table does not have, was removed.
        styler.set_table_styles(
            {"task": [{"selector": "", "props": [("font-weight", "bold")]}]},
            overwrite=False,
        )
        return styler

    # `acc_gain` is the gain column created by load_cot_data and the one that
    # make_pretty styles. The original selected `delta_abs`, which
    # load_cot_data does not create.
    columns = [
        "task", "cot_chain", "best_of",
        "temperature", "top_k", "top_p",
        "acc_base", "acc_cot", "acc_gain",
    ]

    df_cot_model = (
        df_cot_regimes.loc[df_cot_regimes.model.eq(model_id), columns]
        .rename(columns={"temperature": "temp"})
        .replace({"cot_chain": "ReflectBeforeRun"}, "Reflect")
        .sort_values(["task", "cot_chain"])
        .reset_index(drop=True)
    )

    return df_cot_model.style.pipe(make_pretty)
81
+
82
def styled_model_table(model_id, request: gr.Request):
    """Return the styled regime table, honouring a ``model`` query parameter.

    Args:
        model_id: Model chosen in the UI.
        request: Incoming Gradio request. A ``model`` query parameter that
            names a model present in ``df_cot_regimes`` overrides ``model_id``.

    Returns:
        The result of ``get_model_table`` for the effective model.
    """
    has_override = request and "model" in request.query_params
    if has_override:
        candidate = request.query_params["model"]
        known_models = df_cot_regimes.model.to_list()
        if candidate in known_models:
            model_id = candidate
    return get_model_table(model_id)
88
+
89
+
90
# Models offered in the dropdown, in order of first appearance in the data.
model_choices = list(df_cot_err.model.unique())

# Preferred initial selection. Fall back to the first available model (or no
# selection at all) when it is not part of the loaded data, so the page never
# starts with a value the dropdown does not offer.
PREFERRED_MODEL = "allenai/tulu-2-70b"
if PREFERRED_MODEL in model_choices:
    initial_model = PREFERRED_MODEL
else:
    initial_model = model_choices[0] if model_choices else None

demo = gr.Blocks()

with demo:

    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT)
    with gr.Row():
        model_list = gr.Dropdown(model_choices, value=initial_model, label="Model", scale=2)
        plotly_mode = gr.Radio(["dark", "light"], value="dark", label="Plot theme", scale=1)
        submit = gr.Button("Update", scale=1)
    table = gr.DataFrame()
    plot = gr.Plot(label="evals")

    # Refresh plot and table on button click and on initial page load.
    # plot_evals also writes the effective model back into the dropdown.
    submit.click(plot_evals, [model_list, plotly_mode], [plot, model_list])
    submit.click(styled_model_table, model_list, table)
    demo.load(plot_evals, [model_list, plotly_mode], [plot, model_list])
    demo.load(styled_model_table, model_list, table)

demo.launch()
backend/data.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import json
3
+
4
+ import datasets # type: ignore
5
+ from huggingface_hub import snapshot_download # type: ignore
6
+ import pandas as pd # type: ignore
7
+
8
+ from backend.envs import EVAL_DATASET, TRACES_DATASET, TOKEN, EVAL_RESULTS_PATH
9
+
10
+
11
# Names of the per-model result sub-directories that load_cot_data scans.
SUBSETS = [
    "base",
    "cot",
    "orig",
]
12
+
13
+
14
def _read_eval_results():
    """Download the evaluation results and flatten them into one dataframe."""
    # download raw data
    snapshot_download(
        repo_id=EVAL_DATASET,
        revision="main",
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        max_workers=60,
        token=TOKEN,
    )

    data_root = f"{EVAL_RESULTS_PATH}/data/"

    # every two-level sub-directory of data/ is treated as a model id
    models = [
        path.replace(data_root, "")
        for path in glob.glob(f"{data_root}*/*", recursive=False)
    ]

    records = []
    for model in models:
        for subset in SUBSETS:
            pattern = f"{data_root}{model}/{subset}/**/*.json"
            for json_filepath in glob.glob(pattern, recursive=True):
                with open(json_filepath, encoding="utf-8") as fp:
                    data = json.load(fp)
                if "results" not in data:
                    continue
                for result in data["results"].values():
                    records.append({**result, "model": model, "subset": subset})

    if not records:
        raise ValueError(f"No evaluation results found under {data_root}")

    return pd.DataFrame(records)


def _split_alias(alias: str) -> pd.Series:
    """Split a result alias into its ``task`` and ``config`` parts."""
    # strip the subset suffix, if any
    for suffix in ("_base", "_cot"):
        if alias.endswith(suffix):
            alias = alias[: -len(suffix)]
            break

    if "_" not in alias:
        return pd.Series({"task": alias, "config": ""})

    # NOTE(review): this expects exactly one underscore ("<config>_<task>") and
    # raises ValueError otherwise -- confirm that aliases never contain more.
    config, task = alias.split("_")
    return pd.Series({"task": task, "config": config})


def _build_cot_frame(df_results):
    """Combine CoT accuracies with per-(model, task) baseline accuracies."""
    # baseline accuracies, averaged per model and task
    df_baseline = (
        df_results[df_results.subset.eq("base")]
        .groupby(["model", "task"])[["acc,none"]]
        .mean()
        .rename(columns={"acc,none": "acc_base"})
        .reset_index()
    )

    df_cot = (
        df_results[df_results.subset.eq("cot")]
        .sort_values(by=["model", "task", "config"])
        .reset_index(drop=True)
    )
    df_cot = df_cot[["model", "task", "config", "acc,none"]].rename(columns={"acc,none": "acc_cot"})

    # A left merge keeps the row order of the CoT results. Rows without a
    # baseline get NaN instead of aborting the whole load with a KeyError.
    df_cot = df_cot.merge(df_baseline, on=["model", "task"], how="left")
    missing = df_cot.loc[df_cot.acc_base.isna(), ["model", "task"]].drop_duplicates()
    for row in missing.itertuples(index=False):
        print(f"No baseline accuracy found for model {row.model} on task {row.task}")

    df_cot["acc_gain"] = df_cot.acc_cot - df_cot.acc_base
    df_cot["delta_rel"] = df_cot.acc_gain / df_cot.acc_base

    # average over all tasks, stored as pseudo-task "all" and placed first
    df_cot_avg = df_cot.groupby(["model", "config"]).mean(numeric_only=True).reset_index()
    df_cot_avg["task"] = "all"

    return pd.concat([df_cot_avg, df_cot], ignore_index=True)


def _load_cot_configs():
    """Load the distinct ``config_data`` records from the traces dataset."""
    dataset = datasets.load_dataset(TRACES_DATASET, split="test", token=TOKEN)
    dataset = dataset.select_columns(["config_data"])
    config_data = [dict(data) for data in dataset["config_data"]]
    del dataset  # release the dataset before building the dataframe

    df_cotconfigs = pd.DataFrame(config_data)
    df_cotconfigs.drop_duplicates(inplace=True, ignore_index=True)
    return df_cotconfigs


def _attach_cot_configs(df_cot, df_cotconfigs):
    """Append the config columns matching each row's (config, model) pair."""
    # One config record per (name, model): keep the first, as the original
    # row-wise lookup did, and drop the config's own task column.
    df_configs = (
        df_cotconfigs
        .drop_duplicates(subset=["name", "model"], ignore_index=True)
        .drop(columns=["task"])
        .rename(columns={"name": "config"})
    )

    merged = df_cot.merge(
        df_configs, on=["config", "model"], how="left", indicator="_config_merge"
    )
    unmatched = merged.loc[merged["_config_merge"].eq("left_only"), ["config", "model"]]
    for row in unmatched.itertuples(index=False):
        print(f"Config {row.config} not found for model {row.model}")

    return merged.drop(columns=["_config_merge"])


def _build_error_frame(df_cot):
    """Aggregate per (model, task): mean accuracies and the acc_gain spread."""
    df_cot_err = df_cot.groupby(["model", "task"]).agg(
        {"acc_gain": ["mean", "min", "max"], "acc_base": "mean", "acc_cot": "mean"}
    )
    # flatten the two-level column index, e.g. ("acc_gain", "mean") -> "acc_gain-mean"
    df_cot_err.columns = ["-".join(col).strip() for col in df_cot_err.columns.values]
    # half the min-max range, used as error bar
    df_cot_err["acc_gain-err"] = 0.5 * (df_cot_err["acc_gain-max"] - df_cot_err["acc_gain-min"])
    df_cot_err.reset_index(inplace=True)
    df_cot_err.rename(
        columns={
            "acc_base-mean": "base accuracy",
            "acc_cot-mean": "cot accuracy",
            "acc_gain-mean": "marginal acc. gain",
        },
        inplace=True,
    )
    return df_cot_err


def load_cot_data():
    """Load evaluation results and CoT configs and build the dashboard frames.

    Downloads the evaluation results dataset, reads every result JSON found
    for the subsets in ``SUBSETS``, derives per-regime accuracy gains over the
    baseline, and joins the matching ``config_data`` from the traces dataset.

    Returns:
        A tuple ``(df_cot_err, df_cot)``:

        * ``df_cot_err`` -- one row per (model, task) with the columns
          ``base accuracy``, ``cot accuracy``, ``marginal acc. gain``,
          ``acc_gain-min``, ``acc_gain-max`` and ``acc_gain-err``.
        * ``df_cot`` -- one row per (model, task, config), including a
          pseudo-task ``"all"`` holding per-config averages, with
          ``acc_base``, ``acc_cot``, ``acc_gain`` (all in percent),
          ``delta_rel`` and the config columns.

    Raises:
        ValueError: If no evaluation results are found, or (from
            ``_split_alias``) if an alias contains more than one underscore
            after its subset suffix is stripped.
    """
    ####
    # Load the evaluation results data
    ####
    df_results = _read_eval_results()

    # postprocess task/config data
    df_results = pd.concat([df_results, df_results["alias"].apply(_split_alias)], axis=1)

    df_cot = _build_cot_frame(df_results)

    ####
    # Load the traces data and add the cot configs to df_cot
    ####
    df_cot = _attach_cot_configs(df_cot, _load_cot_configs())

    # accuracy values in percent
    for col in ("acc_base", "acc_cot", "acc_gain"):
        df_cot[col] = 100 * df_cot[col]

    ####
    # Create error dataframe
    ####
    df_cot_err = _build_error_frame(df_cot)

    return df_cot_err, df_cot
backend/envs.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from huggingface_hub import HfApi # type: ignore
4
+
5
+
6
# Access token for the Hugging Face Hub, read from the environment
# (None when the TOKEN variable is not set).
TOKEN = os.environ.get("TOKEN")

# Hub identifiers, all under the same owner.
OWNER = "cot-leaderboard"
REPO_ID = OWNER + "/open_cot_dashboard"
EVAL_DATASET = OWNER + "/cot-eval-results"
TRACES_DATASET = OWNER + "/cot-eval-traces-2.0"

# Base directory for local caches; defaults to the current directory.
CACHE_PATH = os.environ.get("HF_HOME", ".")

# Local caches
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "cot-eval-results")

# Shared Hub API client, authenticated with TOKEN.
API = HfApi(token=TOKEN)